From 248e23b50e2da0753f3b5faa068939cbe9f8a75a Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Feb 2017 11:26:33 +0100 Subject: [PATCH 01/69] batman-adv: Fix double free during fragment merge error The function batadv_frag_skb_buffer was supposed not to consume the skbuff on errors. This was followed in the helper function batadv_frag_insert_packet when the skb would potentially be inserted in the fragment queue. But it could happen that the next helper function batadv_frag_merge_packets would try to merge the fragments and fail. This results in a kfree_skb of all the enqueued fragments (including the just inserted one). batadv_recv_frag_packet would detect the error in batadv_frag_skb_buffer and try to free the skb again. The behavior of batadv_frag_skb_buffer (and its helper batadv_frag_insert_packet) must therefore be changed to always consume the skbuff to have a common behavior and avoid the double kfree_skb. Fixes: 610bfc6bc99b ("batman-adv: Receive fragmented packets and merge") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 0854ebd8613e9b..31e97e9aee0d54 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -239,8 +239,10 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, spin_unlock_bh(&chain->lock); err: - if (!ret) + if (!ret) { kfree(frag_entry_new); + kfree_skb(skb); + } return ret; } @@ -313,7 +315,7 @@ batadv_frag_merge_packets(struct hlist_head *chain) * * There are three possible outcomes: 1) Packet is merged: Return true and * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb - * to NULL; 3) Error: Return false and leave skb as is. + * to NULL; 3) Error: Return false and free skb. * * Return: true when packet is merged or buffered, false when skb is not not * used. 
@@ -338,9 +340,9 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, goto out_err; out: - *skb = skb_out; ret = true; out_err: + *skb = skb_out; return ret; } From 51c6b429c0c95e67edd1cb0b548c5cf6a6604763 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Mon, 13 Feb 2017 20:44:31 +0100 Subject: [PATCH 02/69] batman-adv: Fix transmission of final, 16th fragment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to split and transmit a unicast packet in 16 parts will fail for the final fragment: After having sent the 15th one with a frag_packet.no index of 14, we will increase the index to 15 - and return with an error code immediately, even though one more fragment is due for transmission and allowed. Fixing this issue by moving the check before incrementing the index. While at it, adding an unlikely(), because the check is actually more of an assertion. Fixes: ee75ed88879a ("batman-adv: Fragment and send skbs larger than mtu") Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 31e97e9aee0d54..11149e5be4e0ef 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -501,6 +501,12 @@ int batadv_frag_send_packet(struct sk_buff *skb, /* Eat and send fragments from the tail of skb */ while (skb->len > max_fragment_size) { + /* The initial check in this function should cover this case */ + if (unlikely(frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)) { + ret = -EINVAL; + goto put_primary_if; + } + skb_fragment = batadv_frag_create(skb, &frag_header, mtu); if (!skb_fragment) { ret = -ENOMEM; @@ -517,12 +523,6 @@ int batadv_frag_send_packet(struct sk_buff *skb, } frag_header.no++; - - /* The initial check in this function should cover this case */ - if 
(frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) { - ret = -EINVAL; - goto put_primary_if; - } } /* Make room for the fragment header. */ From 0328edc77d4f35014b35f32b46be0a7e16aae74f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Feb 2017 08:59:16 +0100 Subject: [PATCH 03/69] mac80211: fix packet statistics for fast-RX When adding per-CPU statistics, which added statistics back to mac80211 for the fast-RX path, I evidently forgot to add the "stats->packets++" line. The reason for that is likely that I didn't see it since it's done in defragmentation for the regular RX path. Add the missing line to properly count received packets in the fast-RX case. Fixes: c9c5962b56c1 ("mac80211: enable collecting station statistics per-CPU") Reported-by: Oren Givon Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 50ca3828b1242e..a8443d8bc23323 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3880,6 +3880,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, stats->last_rate = sta_stats_encode_rate(status); stats->fragments++; + stats->packets++; if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { stats->last_signal = status->signal; From a9e9200d8661c1a0be8c39f93deb383dc940de35 Mon Sep 17 00:00:00 2001 From: Matt Chen Date: Sun, 22 Jan 2017 02:16:58 +0800 Subject: [PATCH 04/69] mac80211: flush delayed work when entering suspend The issue was found when entering suspend and resume. It triggers a warning in: mac80211/key.c: ieee80211_enable_keys() ... WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || sdata->crypto_tx_tailroom_pending_dec); ... It points out sdata->crypto_tx_tailroom_pending_dec isn't cleaned up successfully in a delayed_work during suspend. Add a flush_delayed_work to fix it. 
Cc: stable@vger.kernel.org Signed-off-by: Matt Chen Signed-off-by: Johannes Berg --- net/mac80211/pm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 28a3a0957c9e35..76a8bcd8ef1123 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -168,6 +168,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) break; } + flush_delayed_work(&sdata->dec_tailroom_needed_wk); drv_remove_interface(local, sdata); } From b7540d8f25c8034de7e4163fc23ac457bf057731 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 6 Feb 2017 15:28:42 +0200 Subject: [PATCH 05/69] mac80211: don't reorder frames with SN smaller than SSN When RX aggregation starts, transmitter may continue to send frames with SN smaller than SSN until the AddBA response is received. However, the reorder buffer is already initialized at this point, which will cause the drop of such frames as duplicates since the head SN of the reorder buffer is set to the SSN, which is bigger. 
Cc: stable@vger.kernel.org Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 1 + net/mac80211/rx.c | 14 +++++++++++++- net/mac80211/sta_info.h | 6 ++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 3b5fd4188f2ac7..58ad23a441097c 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -398,6 +398,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->timeout = timeout; tid_agg_rx->stored_mpdu_num = 0; tid_agg_rx->auto_seq = auto_seq; + tid_agg_rx->started = false; tid_agg_rx->reorder_buf_filtered = 0; status = WLAN_STATUS_SUCCESS; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a8443d8bc23323..28cc494a774d0e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright(c) 2015 - 2016 Intel Deutschland GmbH + * Copyright(c) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1034,6 +1034,18 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata buf_size = tid_agg_rx->buf_size; head_seq_num = tid_agg_rx->head_seq_num; + /* + * If the current MPDU's SN is smaller than the SSN, it shouldn't + * be reordered. 
+ */ + if (unlikely(!tid_agg_rx->started)) { + if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { + ret = false; + goto out; + } + tid_agg_rx->started = true; + } + /* frame with out of date sequence number */ if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { dev_kfree_skb(skb); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index dd06ef0b886145..15599c70a38fc9 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -189,6 +189,7 @@ struct tid_ampdu_tx { * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and * and ssn. * @removed: this session is removed (but might have been found due to RCU) + * @started: this session has started (head ssn or higher was received) * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. @@ -212,8 +213,9 @@ struct tid_ampdu_rx { u16 ssn; u16 buf_size; u16 timeout; - bool auto_seq; - bool removed; + u8 auto_seq:1, + removed:1, + started:1; }; /** From 8fbcfeb8a9cc803464d6c166e7991913711c612c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Feb 2017 10:27:37 +0000 Subject: [PATCH 06/69] mac80211_hwsim: Replace bogus hrtimer clockid mac80211_hwsim initializes a hrtimer with clockid CLOCK_MONOTONIC_RAW. That's not supported. Use CLOCK_MONOTONIC instead. 
Signed-off-by: Thomas Gleixner Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 1620a5d2757d38..0889fc81ce9e47 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2671,7 +2671,7 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, tasklet_hrtimer_init(&data->beacon_timer, mac80211_hwsim_beacon, - CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS); + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); spin_lock_bh(&hwsim_radio_lock); list_add_tail(&data->list, &hwsim_radios); From 890030d3c425f49abaa4acf60e20f288b599f980 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 22 Feb 2017 16:16:07 +0100 Subject: [PATCH 07/69] mac80211: don't handle filtered frames within a BA session When running a BA session, the driver (or the hardware) already takes care of retransmitting failed frames, since it has to keep the receiver reorder window in sync. Adding another layer of retransmit around that does not improve anything. In fact, it can only lead to some strong reordering with huge latency. 
Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/status.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a3af6e1bfd984d..05ccd55b5d83d4 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -51,7 +51,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct ieee80211_hdr *hdr = (void *)skb->data; int ac; - if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { + if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | + IEEE80211_TX_CTL_AMPDU)) { ieee80211_free_txskb(&local->hw, skb); return; } From d98937f4ea713d21e0fcc345919f86c877dd8d6f Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 20 Feb 2017 14:24:36 +0100 Subject: [PATCH 08/69] mac80211: fix power saving clients handling in iwlwifi iwlwifi now supports RSS and can't let mac80211 track the PS state based on the Rx frames since they can come out of order. iwlwifi is now advertising AP_LINK_PS, and uses explicit notifications to teach mac80211 about the PS state of the stations and the PS poll / uAPSD trigger frames coming our way from the peers. Because of that, the TIM stopped being maintained in mac80211. I tried to fix this in commit c68df2e7be0c ("mac80211: allow using AP_LINK_PS with mac80211-generated TIM IE") but that was later reverted by Felix in commit 6c18a6b4e799 ("Revert "mac80211: allow using AP_LINK_PS with mac80211-generated TIM IE") since it broke drivers that do not implement set_tim. Since none of the drivers that set AP_LINK_PS have the set_tim() handler set besides iwlwifi, I can bail out in __sta_info_recalc_tim if AP_LINK_PS AND .set_tim is not implemented. 
Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 4774e663a4112f..8bb99d299cdaa9 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) + if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) From 2595d259b667114431501bae51b45d6656b987d1 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 20 Feb 2017 14:24:39 +0100 Subject: [PATCH 09/69] mac80211: shorten debug message Tracing is limited to 100 characters and this message passes the limit when there are a few buffered frames. Shorten it. Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8bb99d299cdaa9..3323a2fb289bd0 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1264,7 +1264,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) sta_info_recalc_tim(sta); ps_dbg(sdata, - "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", + "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); From 09e0a2fe102208cbaf39510b8b04dd524d7d2935 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 20 Feb 2017 14:24:38 +0100 Subject: [PATCH 10/69] mac80211: fix typo in debug print Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 58ad23a441097c..4456559cb056d1 100644 
--- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -85,7 +85,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, ht_dbg(sta->sdata, "Rx BA session stop requested for %pM tid %u %s reason: %d\n", sta->sta.addr, tid, - initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator", + initiator == WLAN_BACK_RECIPIENT ? "recipient" : "initiator", (int)reason); if (drv_ampdu_action(local, sta->sdata, &params)) From ff4dd73dd2b4806419f8ff65cbce11d5019548d0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 27 Feb 2017 17:15:28 +0100 Subject: [PATCH 11/69] mac80211_hwsim: check HWSIM_ATTR_RADIO_NAME length Unfortunately, the nla policy was defined to have HWSIM_ATTR_RADIO_NAME as an NLA_STRING, rather than NLA_NUL_STRING, so we can't use it as a NUL-terminated string in the kernel. Rather than break the API, kasprintf() the string to a new buffer to guarantee NUL termination. Reported-by: Andrew Zaborowski Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 28 ++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 0889fc81ce9e47..50c219fb1a52b9 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3056,6 +3056,7 @@ static int hwsim_register_received_nl(struct sk_buff *skb_2, static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_new_radio_params param = { 0 }; + const char *hwname = NULL; param.reg_strict = info->attrs[HWSIM_ATTR_REG_STRICT_REG]; param.p2p_device = info->attrs[HWSIM_ATTR_SUPPORT_P2P_DEVICE]; @@ -3069,8 +3070,14 @@ static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) if (info->attrs[HWSIM_ATTR_NO_VIF]) param.no_vif = true; - if (info->attrs[HWSIM_ATTR_RADIO_NAME]) - param.hwname = nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]); + if (info->attrs[HWSIM_ATTR_RADIO_NAME]) { + hwname = 
kasprintf(GFP_KERNEL, "%.*s", + nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]), + (char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME])); + if (!hwname) + return -ENOMEM; + param.hwname = hwname; + } if (info->attrs[HWSIM_ATTR_USE_CHANCTX]) param.use_chanctx = true; @@ -3098,11 +3105,15 @@ static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) s64 idx = -1; const char *hwname = NULL; - if (info->attrs[HWSIM_ATTR_RADIO_ID]) + if (info->attrs[HWSIM_ATTR_RADIO_ID]) { idx = nla_get_u32(info->attrs[HWSIM_ATTR_RADIO_ID]); - else if (info->attrs[HWSIM_ATTR_RADIO_NAME]) - hwname = (void *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]); - else + } else if (info->attrs[HWSIM_ATTR_RADIO_NAME]) { + hwname = kasprintf(GFP_KERNEL, "%.*s", + nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]), + (char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME])); + if (!hwname) + return -ENOMEM; + } else return -EINVAL; spin_lock_bh(&hwsim_radio_lock); @@ -3111,7 +3122,8 @@ static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) if (data->idx != idx) continue; } else { - if (strcmp(hwname, wiphy_name(data->hw->wiphy))) + if (!hwname || + strcmp(hwname, wiphy_name(data->hw->wiphy))) continue; } @@ -3122,10 +3134,12 @@ static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) spin_unlock_bh(&hwsim_radio_lock); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), info); + kfree(hwname); return 0; } spin_unlock_bh(&hwsim_radio_lock); + kfree(hwname); return -ENODEV; } From 19d19e960598161be92a7e4828eb7706c6410ce6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 27 Feb 2017 09:38:11 +0100 Subject: [PATCH 12/69] mac80211: use driver-indicated transmitter STA only for data frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I originally introduced using the driver-indicated station as an optimisation to avoid the hashtable lookup/iteration, of course it wasn't intended to really functionally 
change anything. I neglected, however, to take into account VLAN interfaces, which have the property that management and data frames are handled differently: data frames go directly to the station and the VLAN while management frames continue to be processed over the underlying/associated AP-type interface. As a consequence, when a driver used this optimisation for management frames and the user enabled VLANs, my change broke things since any management frames, particularly disassoc/deauth, were missed by hostapd. Fix this by restoring the original code path for non-data frames, they aren't critical for performance to begin with. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=194713. Big thanks goes to Jarek who bisected the issue and provided a very detailed bug report, including the crucial information that he was using VLANs in his configuration. Cc: stable@vger.kernel.org Fixes: 771e846bea9e ("mac80211: allow passing transmitter station on RX") Reported-and-tested-by: Jarek Kamiński Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 28cc494a774d0e..e48724a6725e32 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4086,15 +4086,17 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, ieee80211_is_beacon(hdr->frame_control))) ieee80211_scan_rx(local, skb); - if (pubsta) { - rx.sta = container_of(pubsta, struct sta_info, sta); - rx.sdata = rx.sta->sdata; - if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) - return; - goto out; - } else if (ieee80211_is_data(fc)) { + if (ieee80211_is_data(fc)) { struct sta_info *sta, *prev_sta; + if (pubsta) { + rx.sta = container_of(pubsta, struct sta_info, sta); + rx.sdata = rx.sta->sdata; + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; + goto out; + } + prev_sta = NULL; for_each_sta_info(local, hdr->addr2, sta, tmp) { From 
29e09229d9f26129a39462fae0ddabc4d9533989 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Feb 2017 08:39:28 +0100 Subject: [PATCH 13/69] netfilter: use skb_to_full_sk in ip_route_me_harder inet_sk(skb->sk) is illegal in case skb is attached to request socket. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Reported by: Daniel J Blueman Signed-off-by: Florian Westphal Tested-by: Daniel J Blueman Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index b3cc1335adbc1a..c0cc6aa8cfaa9c 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -23,7 +23,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; - __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + const struct sock *sk = skb_to_full_sk(skb); + __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0; struct net_device *dev = skb_dst(skb)->dev; unsigned int hh_len; @@ -40,7 +41,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t fl4.daddr = iph->daddr; fl4.saddr = saddr; fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_oif = sk ? 
sk->sk_bound_dev_if : 0; if (!fl4.flowi4_oif) fl4.flowi4_oif = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; @@ -61,7 +62,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); skb_dst_set(skb, NULL); - dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); + dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_set(skb, dst); From 540b1c48c37ac0ad66212004db21e1ff7e2d78be Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Feb 2017 15:43:06 +0000 Subject: [PATCH 14/69] rxrpc: Fix deadlock between call creation and sendmsg/recvmsg All the routines by which rxrpc is accessed from the outside are serialised by means of the socket lock (sendmsg, recvmsg, bind, rxrpc_kernel_begin_call(), ...) and this presents a problem: (1) If a number of calls on the same socket are in the process of connection to the same peer, a maximum of four concurrent live calls are permitted before further calls need to wait for a slot. (2) If a call is waiting for a slot, it is deep inside sendmsg() or rxrpc_kernel_begin_call() and the entry function is holding the socket lock. (3) sendmsg() and recvmsg() or the in-kernel equivalents are prevented from servicing the other calls as they need to take the socket lock to do so. (4) The socket is stuck until a call is aborted and makes its slot available to the waiter. Fix this by: (1) Provide each call with a mutex ('user_mutex') that arbitrates access by the users of rxrpc separately for each specific call. (2) Make rxrpc_sendmsg() and rxrpc_recvmsg() unlock the socket as soon as they've got a call and taken its mutex. Note that I'm returning EWOULDBLOCK from recvmsg() if MSG_DONTWAIT is set but someone else has the lock. 
Should I instead only return EWOULDBLOCK if there's nothing currently to be done on a socket, and sleep in this particular instance because there is something to be done, but we appear to be blocked by the interrupt handler doing its ping? (3) Make rxrpc_new_client_call() unlock the socket after allocating a new call, locking its user mutex and adding it to the socket's call tree. The call is returned locked so that sendmsg() can add data to it immediately. From the moment the call is in the socket tree, it is subject to access by sendmsg() and recvmsg() - even if it isn't connected yet. (4) Lock new service calls in the UDP data_ready handler (in rxrpc_new_incoming_call()) because they may already be in the socket's tree and the data_ready handler makes them live immediately if a user ID has already been preassigned. Note that the new call is locked before any notifications are sent that it is live, so doing mutex_trylock() *ought* to always succeed. Userspace is prevented from doing sendmsg() on calls that are in a too-early state in rxrpc_do_sendmsg(). (5) Make rxrpc_new_incoming_call() return the call with the user mutex held so that a ping can be scheduled immediately under it. Note that it might be worth moving the ping call into rxrpc_new_incoming_call() and then we can drop the mutex there. (6) Make rxrpc_accept_call() take the lock on the call it is accepting and release the socket after adding the call to the socket's tree. This is slightly tricky as we've dequeued the call by that point and have to requeue it. Note that requeuing emits a trace event. (7) Make rxrpc_kernel_send_data() and rxrpc_kernel_recv_data() take the new mutex immediately and don't bother with the socket mutex at all. This patch has the nice bonus that calls on the same socket are now to some extent parallelisable. 
Note that we might want to move rxrpc_service_prealloc() calls out from the socket lock and give it its own lock, so that we don't hang progress in other calls because we're waiting for the allocator. We probably also want to avoid calling rxrpc_notify_socket() from within the socket lock (rxrpc_accept_call()). Signed-off-by: David Howells Tested-by: Marc Dionne Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 12 ++++++-- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_accept.c | 48 ++++++++++++++++++++++++++++++ net/rxrpc/call_object.c | 18 ++++++++++-- net/rxrpc/input.c | 1 + net/rxrpc/recvmsg.c | 39 ++++++++++++++++++++---- net/rxrpc/sendmsg.c | 57 +++++++++++++++++++++++++++++------- 8 files changed, 156 insertions(+), 22 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 593f586545eba9..39123c06a56613 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -119,6 +119,7 @@ enum rxrpc_recvmsg_trace { rxrpc_recvmsg_full, rxrpc_recvmsg_hole, rxrpc_recvmsg_next, + rxrpc_recvmsg_requeue, rxrpc_recvmsg_return, rxrpc_recvmsg_terminal, rxrpc_recvmsg_to_be_accepted, @@ -277,6 +278,7 @@ enum rxrpc_congest_change { EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ + EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ EM(rxrpc_recvmsg_to_be_accepted, "TBAC") \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 199b46e93e64ee..7fb59c3f1542af 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -290,10 +290,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); + /* The socket has been unlocked. 
*/ if (!IS_ERR(call)) call->notify_rx = notify_rx; - release_sock(&rx->sk); + mutex_unlock(&call->user_mutex); _leave(" = %p", call); return call; } @@ -310,7 +311,10 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call); void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); + + mutex_lock(&call->user_mutex); rxrpc_release_call(rxrpc_sk(sock->sk), call); + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_kernel); } EXPORT_SYMBOL(rxrpc_kernel_end_call); @@ -450,14 +454,16 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: ret = rxrpc_do_sendmsg(rx, m, len); - break; + /* The socket has been unlocked */ + goto out; default: ret = -EINVAL; - break; + goto error_unlock; } error_unlock: release_sock(&rx->sk); +out: _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 12be432be9b2fe..26a7b1db1361e5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -467,6 +467,7 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ + struct mutex user_mutex; /* User access mutex */ ktime_t ack_at; /* When deferred ACK needs to happen */ ktime_t resend_at; /* When next resend needs to happen */ ktime_t ping_at; /* When next to send a ping */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 7c4c64ab8da2e2..0ed181f53f32a0 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -323,6 +323,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * * If we want to report an error, we mark the skb with the packet type and * abort code and return NULL. + * + * The call is returned with the user access mutex held. 
*/ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_connection *conn, @@ -371,6 +373,18 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); + /* Lock the call to prevent rxrpc_kernel_send/recv_data() and + * sendmsg()/recvmsg() inconveniently stealing the mutex once the + * notification is generated. + * + * The BUG should never happen because the kernel should be well + * behaved enough not to access the call before the first notification + * event and userspace is prevented from doing so until the state is + * appropriate. + */ + if (!mutex_trylock(&call->user_mutex)) + BUG(); + /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; @@ -429,10 +443,12 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, /* * handle acceptance of a call by userspace * - assign the user call ID to the call at the front of the queue + * - called with the socket locked. */ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, unsigned long user_call_ID, rxrpc_notify_rx_t notify_rx) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; struct rb_node *parent, **pp; @@ -446,6 +462,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, if (list_empty(&rx->to_be_accepted)) { write_unlock(&rx->call_lock); + release_sock(&rx->sk); kleave(" = -ENODATA [empty]"); return ERR_PTR(-ENODATA); } @@ -470,10 +487,39 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, */ call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); + write_unlock(&rx->call_lock); + + /* We need to gain the mutex from the interrupt handler without + * upsetting lockdep, so we have to release it there and take it here. + * We are, however, still holding the socket lock, so other accepts + * must wait for us and no one can add the user ID behind our backs. 
+ */ + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + release_sock(&rx->sk); + kleave(" = -ERESTARTSYS"); + return ERR_PTR(-ERESTARTSYS); + } + + write_lock(&rx->call_lock); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); + /* Find the user ID insertion point. */ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + BUG(); + } + write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: @@ -499,6 +545,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); rxrpc_notify_socket(call); rxrpc_service_prealloc(rx, GFP_KERNEL); + release_sock(&rx->sk); _leave(" = %p{%d}", call, call->debug_id); return call; @@ -515,6 +562,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); out: rxrpc_service_prealloc(rx, GFP_KERNEL); + release_sock(&rx->sk); _leave(" = %d", ret); return ERR_PTR(ret); } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8b94db3c9b2ecb..d79cd36987a95b 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -115,6 +115,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) if (!call->rxtx_annotations) goto nomem_2; + mutex_init(&call->user_mutex); setup_timer(&call->timer, rxrpc_call_timer_expired, (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); @@ -194,14 +195,16 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) } /* - * set up a call for the given data - * - called in process context with IRQs enabled + * Set up a call for the given parameters. + * - Called with the socket lock held, which it must release. + * - If it returns a call, the call's lock will need releasing by the caller. 
*/ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, unsigned long user_call_ID, gfp_t gfp) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call, *xcall; struct rb_node *parent, **pp; @@ -212,6 +215,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, call = rxrpc_alloc_client_call(srx, gfp); if (IS_ERR(call)) { + release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); return call; } @@ -219,6 +223,11 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), here, (const void *)user_call_ID); + /* We need to protect a partially set up call against the user as we + * will be acting outside the socket lock. + */ + mutex_lock(&call->user_mutex); + /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); @@ -250,6 +259,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock(&rxrpc_call_lock); + /* From this point on, the call is protected by its own lock. */ + release_sock(&rx->sk); + /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. 
*/ @@ -279,6 +291,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, */ error_dup_user_ID: write_unlock(&rx->call_lock); + release_sock(&rx->sk); ret = -EEXIST; error: @@ -287,6 +300,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), here, ERR_PTR(ret)); rxrpc_release_call(rx, call); + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ERR_PTR(ret); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 78ec33477adf6c..9f4cfa25af7c92 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1194,6 +1194,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto reject_packet; } rxrpc_send_ping(call, skb, skew); + mutex_unlock(&call->user_mutex); } rxrpc_input_call_packet(call, skb, skew); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index f3a688e108430a..22447dbcc38021 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -487,6 +487,20 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); + /* We're going to drop the socket lock, so we need to lock the call + * against interference by sendmsg. 
+ */ + if (!mutex_trylock(&call->user_mutex)) { + ret = -EWOULDBLOCK; + if (flags & MSG_DONTWAIT) + goto error_requeue_call; + ret = -ERESTARTSYS; + if (mutex_lock_interruptible(&call->user_mutex) < 0) + goto error_requeue_call; + } + + release_sock(&rx->sk); + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -502,7 +516,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, &call->user_call_ID); } if (ret < 0) - goto error; + goto error_unlock_call; } if (msg->msg_name) { @@ -533,12 +547,12 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } if (ret < 0) - goto error; + goto error_unlock_call; if (call->state == RXRPC_CALL_COMPLETE) { ret = rxrpc_recvmsg_term(call, msg); if (ret < 0) - goto error; + goto error_unlock_call; if (!(flags & MSG_PEEK)) rxrpc_release_call(rx, call); msg->msg_flags |= MSG_EOR; @@ -551,8 +565,21 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_flags &= ~MSG_MORE; ret = copied; -error: +error_unlock_call: + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); + return ret; + +error_requeue_call: + if (!(flags & MSG_PEEK)) { + write_lock_bh(&rx->recvmsg_lock); + list_add(&call->recvmsg_link, &rx->recvmsg_q); + write_unlock_bh(&rx->recvmsg_lock); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0); + } else { + rxrpc_put_call(call, rxrpc_call_put); + } error_no_call: release_sock(&rx->sk); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); @@ -609,7 +636,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, iov.iov_len = size - *_offset; iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); switch (call->state) { case RXRPC_CALL_CLIENT_RECV_REPLY: @@ -648,7 +675,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: 
ret = 1; out: - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); return ret; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0a6ef217aa8ada..31c1538c1a8de6 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -59,9 +59,12 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, } trace_rxrpc_transmit(call, rxrpc_transmit_wait); - release_sock(&rx->sk); + mutex_unlock(&call->user_mutex); *timeo = schedule_timeout(*timeo); - lock_sock(&rx->sk); + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + ret = sock_intr_errno(*timeo); + break; + } } remove_wait_queue(&call->waitq, &myself); @@ -171,7 +174,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, /* * send data through a socket * - must be called in process context - * - caller holds the socket locked + * - The caller holds the call user access mutex, but not the socket lock. */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -437,10 +440,13 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, /* * Create a new client call for sendmsg(). + * - Called with the socket lock held, which it must release. + * - If it returns a call, the call's lock will need releasing by the caller. 
*/ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, unsigned long user_call_ID, bool exclusive) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -450,8 +456,10 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, _enter(""); - if (!msg->msg_name) + if (!msg->msg_name) { + release_sock(&rx->sk); return ERR_PTR(-EDESTADDRREQ); + } key = rx->key; if (key && !rx->key->payload.data[0]) @@ -464,6 +472,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.exclusive = rx->exclusive | exclusive; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); + /* The socket is now unlocked */ _leave(" = %p\n", call); return call; @@ -475,6 +484,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) + __releases(&rx->sk.sk_lock.slock) { enum rxrpc_command cmd; struct rxrpc_call *call; @@ -488,12 +498,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, &exclusive); if (ret < 0) - return ret; + goto error_release_sock; if (cmd == RXRPC_CMD_ACCEPT) { + ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) - return -EINVAL; + goto error_release_sock; call = rxrpc_accept_call(rx, user_call_ID, NULL); + /* The socket is now unlocked. 
*/ if (IS_ERR(call)) return PTR_ERR(call); rxrpc_put_call(call, rxrpc_call_put); @@ -502,12 +514,29 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) call = rxrpc_find_call_by_user_ID(rx, user_call_ID); if (!call) { + ret = -EBADSLT; if (cmd != RXRPC_CMD_SEND_DATA) - return -EBADSLT; + goto error_release_sock; + ret = -EBUSY; + if (call->state == RXRPC_CALL_UNINITIALISED || + call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || + call->state == RXRPC_CALL_SERVER_PREALLOC || + call->state == RXRPC_CALL_SERVER_SECURING || + call->state == RXRPC_CALL_SERVER_ACCEPTING) + goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, exclusive); + /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); + /* ... and we have the call lock. */ + } else { + ret = mutex_lock_interruptible(&call->user_mutex); + release_sock(&rx->sk); + if (ret < 0) { + ret = -ERESTARTSYS; + goto error_put; + } } _debug("CALL %d USR %lx ST %d on CONN %p", @@ -535,9 +564,15 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len); } + mutex_unlock(&call->user_mutex); +error_put: rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; } /** @@ -562,7 +597,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); @@ -577,7 +612,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); } - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } @@ -598,12 +633,12 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct 
rxrpc_call *call, { _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); if (rxrpc_abort_call(why, call, 0, abort_code, error)) rxrpc_send_abort_packet(call); - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(""); } From 5179b26694c92373275e4933f5d0ff32d585c675 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 28 Feb 2017 12:41:29 +0800 Subject: [PATCH 15/69] sctp: call rcu_read_lock before checking for duplicate transport nodes Commit cd2b70875058 ("sctp: check duplicate node before inserting a new transport") called rhltable_lookup() to check for the duplicate transport node in transport rhashtable. But rhltable_lookup() doesn't call rcu_read_lock inside, it could cause a use-after-free issue if it tries to dereference the node that another cpu has freed it. Note that sock lock can not avoid this as it is per sock. This patch is to fix it by calling rcu_read_lock before checking for duplicate transport nodes. Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new transport") Reported-by: Andrey Konovalov Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. 
Miller --- net/sctp/input.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/input.c b/net/sctp/input.c index fc458968fe4bd8..2a28ab20487f03 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -884,14 +884,17 @@ int sctp_hash_transport(struct sctp_transport *t) arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); + rcu_read_lock(); list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { + rcu_read_unlock(); err = -EEXIST; goto out; } + rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); From 4f7bfb3982e02aacc5fb6e1a121e5326c1778ac3 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 28 Feb 2017 01:45:40 -0500 Subject: [PATCH 16/69] rds: ib: add the static type to the variables The variables rds_ib_mr_1m_pool_size and rds_ib_mr_8k_pool_size are used only in the ib.c file. As such, the static type is added to limit them in this file. Cc: Joe Jin Cc: Junxiao Bi Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib.c | 4 ++-- net/rds/ib_mr.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 91fe46f1e4ccf0..0f557b24331121 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -45,8 +45,8 @@ #include "ib.h" #include "ib_mr.h" -unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; -unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; +static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; +static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; module_param(rds_ib_mr_1m_pool_size, int, 0444); diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 24c086db4511d2..5d6e98a79a5e4b 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -107,8 +107,6 @@ struct rds_ib_mr_pool { }; extern struct workqueue_struct *rds_ib_mr_wq; -extern unsigned int rds_ib_mr_1m_pool_size; -extern unsigned int rds_ib_mr_8k_pool_size; extern bool prefer_frmr; struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, From f7df4923fa986247e93ec2cdff5ca168fff14dcf Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 28 Feb 2017 08:55:40 +0100 Subject: [PATCH 17/69] mlxsw: spectrum_router: Avoid potential packets loss When the structure of the LPM tree changes (f.e., due to the addition of a new prefix), we unbind the old tree and then bind the new one. This may result in temporary packet loss. Instead, overwrite the old binding with the new one. Fixes: 6b75c4807db3 ("mlxsw: spectrum_router: Add virtual router management") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index d7ac22d7f94029..bd8de6b9be718f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -441,30 +441,40 @@ static int mlxsw_sp_vr_lpm_tree_check(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr, struct mlxsw_sp_prefix_usage *req_prefix_usage) { - struct mlxsw_sp_lpm_tree *lpm_tree; + struct mlxsw_sp_lpm_tree *lpm_tree = vr->lpm_tree; + struct mlxsw_sp_lpm_tree *new_tree; + int err; - if (mlxsw_sp_prefix_usage_eq(req_prefix_usage, - &vr->lpm_tree->prefix_usage)) + if (mlxsw_sp_prefix_usage_eq(req_prefix_usage, &lpm_tree->prefix_usage)) return 0; - lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, req_prefix_usage, + new_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, req_prefix_usage, vr->proto, false); - if (IS_ERR(lpm_tree)) { + if (IS_ERR(new_tree)) { /* We failed to get a tree according to the required * prefix usage. However, the current tree might be still good * for us if our requirement is subset of the prefixes used * in the tree. 
*/ if (mlxsw_sp_prefix_usage_subset(req_prefix_usage, - &vr->lpm_tree->prefix_usage)) + &lpm_tree->prefix_usage)) return 0; - return PTR_ERR(lpm_tree); + return PTR_ERR(new_tree); } - mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, vr); - mlxsw_sp_lpm_tree_put(mlxsw_sp, vr->lpm_tree); + /* Prevent packet loss by overwriting existing binding */ + vr->lpm_tree = new_tree; + err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr); + if (err) + goto err_tree_bind; + mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); + + return 0; + +err_tree_bind: vr->lpm_tree = lpm_tree; - return mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr); + mlxsw_sp_lpm_tree_put(mlxsw_sp, new_tree); + return err; } static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, From 0bf09c397e13b70ab6064eb2af69b8fa8e68a24e Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 28 Feb 2017 10:39:48 +0200 Subject: [PATCH 18/69] MAINTAINERS: Orphan usb/net/hso driver The email address of Jan Dumon bounces, and there is no relevant information on the linked website. Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- MAINTAINERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 846f97aa350855..5a00239fd7b311 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6012,9 +6012,8 @@ F: include/linux/hsi/ F: include/uapi/linux/hsi/ HSO 3G MODEM DRIVER -M: Jan Dumon -W: http://www.pharscape.org -S: Maintained +L: linux-usb@vger.kernel.org +S: Orphan F: drivers/net/usb/hso.c HSR NETWORK PROTOCOL From 4f3de46f7a57a8ecc16c7ef69c6917b3731a7c5f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 28 Feb 2017 11:58:22 +0000 Subject: [PATCH 19/69] net: usb: asix_devices: fix missing return code check on call to asix_write_medium_mode The call to asix_write_medium_mode is not updating the return code ret and yet ret is being checked for an error. Fix this by assigning the return code of the asix_write_medium_mode call to ret. 
Detected by CoverityScan, CID#1357148 ("Logically Dead Code") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/usb/asix_devices.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 6e98ede997d3f0..0dd510604118bc 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -346,7 +346,7 @@ static int ax88772_reset(struct usbnet *dev) if (ret < 0) goto out; - asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, 0); + ret = asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, 0); if (ret < 0) goto out; From b2d0fe35471d1a71471f99147ffb5986bd60e744 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 28 Feb 2017 15:02:15 +0300 Subject: [PATCH 20/69] net/mlx4: && vs & typo Bitwise & was obviously intended here. Fixes: 745d8ae4622c ("net/mlx4: Spoofcheck and zero MAC can't coexist") Signed-off-by: Dan Carpenter Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx4/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index e965e5090d9622..a858bcb6220b5d 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -109,7 +109,7 @@ static inline void mlx4_u64_to_mac(u8 *addr, u64 mac) int i; for (i = ETH_ALEN; i > 0; i--) { - addr[i - 1] = mac && 0xFF; + addr[i - 1] = mac & 0xFF; mac >>= 8; } } From 39e6c8208d7b6fb9d2047850fb3327db567b564b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Feb 2017 10:34:50 -0800 Subject: [PATCH 21/69] net: solve a NAPI race While playing with mlx4 hardware timestamping of RX packets, I found that some packets were received by TCP stack with a ~200 ms delay... Since the timestamp was provided by the NIC, and my probe was added in tcp_v4_rcv() while in BH handler, I was confident it was not a sender issue, or a drop in the network. 
This would happen with a very low probability, but hurting RPC workloads. A NAPI driver normally arms the IRQ after the napi_complete_done(), after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab it. Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit while IRQ are not disabled, we might have later an IRQ firing and finding this bit set, right before napi_complete_done() clears it. This can happen with busy polling users, or if gro_flush_timeout is used. But some other uses of napi_schedule() in drivers can cause this as well. thread 1 thread 2 (could be on same cpu, or not) // busy polling or napi_watchdog() napi_schedule(); ... napi->poll() device polling: read 2 packets from ring buffer Additional 3rd packet is available. device hard irq // does nothing because NAPI_STATE_SCHED bit is owned by thread 1 napi_schedule(); napi_complete_done(napi, 2); rearm_irq(); Note that rearm_irq() will not force the device to send an additional IRQ for the packet it already signaled (3rd packet in my example) This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep() can set if it could not grab NAPI_STATE_SCHED Then napi_complete_done() properly reschedules the napi to make sure we do not miss something. Since we manipulate multiple bits at once, use cmpxchg() like in sk_busy_loop() to provide proper transactions. In v2, I changed napi_watchdog() to use a relaxed variant of napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point. In v3, I added more details in the changelog and clears NAPI_STATE_MISSED in busy_poll_stop() In v4, I added the ideas given by Alexander Duyck in v3 review Signed-off-by: Eric Dumazet Cc: Alexander Duyck Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 29 +++++---------- net/core/dev.c | 76 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f40f0ab3847a8c..97456b2539e46d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -330,6 +330,7 @@ struct napi_struct { enum { NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ @@ -338,12 +339,13 @@ enum { }; enum { - NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED), - NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC), - NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED), - NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), }; enum gro_result { @@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } -/** - * napi_schedule_prep - check if NAPI can be scheduled - * @n: NAPI context - * - * Test if NAPI routine is already running, and if not mark - * it as running. This is used as a condition variable to - * insure only one NAPI poll instance runs. We also make - * sure there is no pending NAPI disable. 
- */ -static inline bool napi_schedule_prep(struct napi_struct *n) -{ - return !napi_disable_pending(n) && - !test_and_set_bit(NAPI_STATE_SCHED, &n->state); -} +bool napi_schedule_prep(struct napi_struct *n); /** * napi_schedule - schedule NAPI poll diff --git a/net/core/dev.c b/net/core/dev.c index 304f2deae5f989..e63bf61b19be02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4883,6 +4883,39 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. + */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; + + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); + /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule @@ -4897,7 +4930,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags; + unsigned long flags, val, new; /* * 1) Don't let napi dequeue from the cpu poll list @@ -4927,7 +4960,27 @@ bool napi_complete_done(struct napi_struct *n, int work_done) list_del_init(&n->poll_list); local_irq_restore(flags); } - WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + + do { + val = READ_ONCE(n->state); + + WARN_ON_ONCE(!(val 
& NAPIF_STATE_SCHED)); + + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. + */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (cmpxchg(&n->state, val, new) != val); + + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } + return true; } EXPORT_SYMBOL(napi_complete_done); @@ -4953,6 +5006,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) { int rc; + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); @@ -5088,8 +5151,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); - if (napi->gro_list) - napi_schedule_irqoff(napi); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. + */ + if (napi->gro_list && !napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + __napi_schedule_irqoff(napi); return HRTIMER_NORESTART; } From 56de859e9967c070464a9a9f4f18d73f9447298e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 Feb 2017 11:43:36 -0800 Subject: [PATCH 22/69] vxlan: lock RCU on TX path There is no guarantees that callers of the TX path will hold the RCU lock. Grab it explicitly. 
Fixes: c6fcc4fc5f8b ("vxlan: avoid using stale vxlan socket.") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index b7911994112aeb..e375560cc74e5f 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2105,6 +2105,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); + rcu_read_lock(); if (dst->sa.sa_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; @@ -2127,7 +2128,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port, vni, &rt->dst, rt->rt_flags); if (err) - return; + goto out_unlock; } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) { df = htons(IP_DF); } @@ -2166,7 +2167,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port, vni, ndst, rt6i_flags); if (err) - return; + goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); @@ -2183,6 +2184,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, label, src_port, dst_port, !udp_sum); #endif } +out_unlock: + rcu_read_unlock(); return; drop: @@ -2191,6 +2194,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, return; tx_error: + rcu_read_unlock(); if (err == -ELOOP) dev->stats.collisions++; else if (err == -ENETUNREACH) From a717e3f740803cc88bd5c9a70c93504f6a368663 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 24 Feb 2017 11:43:37 -0800 Subject: [PATCH 23/69] geneve: lock RCU on TX path There is no guarantees that callers of the TX path will hold the RCU lock. Grab it explicitly. Fixes: fceb9c3e3825 ("geneve: avoid using stale geneve socket.") Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/geneve.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 45301cb98bc1c2..7074b40ebd7f8e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -881,12 +881,14 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) info = &geneve->info; } + rcu_read_lock(); #if IS_ENABLED(CONFIG_IPV6) if (info->mode & IP_TUNNEL_INFO_IPV6) err = geneve6_xmit_skb(skb, dev, geneve, info); else #endif err = geneve_xmit_skb(skb, dev, geneve, info); + rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; From 8c171d6ca56c6891372a97af26b58b2cfad7fd9a Mon Sep 17 00:00:00 2001 From: Felix Jia Date: Mon, 27 Feb 2017 12:41:23 +1300 Subject: [PATCH 24/69] net/ipv6: avoid possible dead locking on addr_gen_mode sysctl The addr_gen_mode variable can be accessed by both sysctl and netlink. Replace rtnl_lock() with rtnl_trylock() to protect the sysctl operation and avoid the possible deadlock. Signed-off-by: Felix Jia Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3a2025f5bf2c33..cfc485a8e1c028 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5692,13 +5692,18 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; struct net *net = (struct net *)ctl->extra2; + if (!rtnl_trylock()) + return restart_syscall(); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { new_val = *((int *)ctl->data); - if (check_addr_gen_mode(new_val) < 0) - return -EINVAL; + if (check_addr_gen_mode(new_val) < 0) { + ret = -EINVAL; + goto out; + } /* request for default */ if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) { @@ -5707,20 +5712,23 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, /* request for individual net device */ } else { if (!idev) - return ret; + goto out; - if (check_stable_privacy(idev, net, new_val) < 0) - return -EINVAL; + if (check_stable_privacy(idev, net, new_val) < 0) { + ret = -EINVAL; + goto out; + } if (idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - rtnl_lock(); addrconf_dev_config(idev->dev); - rtnl_unlock(); } } } +out: + rtnl_unlock(); + return ret; } From 3b45a4106f146c336cbcaccb9d8d0fa0e5c3dc1d Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 27 Feb 2017 20:59:39 +0800 Subject: [PATCH 25/69] net: route: add missing nla_policy entry for RTA_MARK attribute This will add stricter validating for RTA_MARK attribute. Signed-off-by: Liping Zhang Signed-off-by: David S. 
Miller --- net/ipv4/fib_frontend.c | 1 + net/ipv6/route.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b39a791f6756fc..42bfd08109dd78 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, + [RTA_MARK] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f54f4265b37f29..d94f1dfa54c842 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2891,6 +2891,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, + [RTA_MARK] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, From df2c43343b47a7138431f431118eb5819e205365 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 1 Mar 2017 16:50:45 +0200 Subject: [PATCH 26/69] bridge: Fix error path in nbp_vlan_init Fix the error path order in nbp_vlan_init, so that if the switchdev_port_attr_set call fails, the vlan_hash is not destroyed before it has been initialized. Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support") CC: Roopa Prabhu Signed-off-by: Yotam Gigi Acked-by: Roopa Prabhu Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/bridge/br_vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 62e68c0dc68740..b838213c408e24 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -997,10 +997,10 @@ int nbp_vlan_init(struct net_bridge_port *p) RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); vlan_tunnel_deinit(vg); -err_vlan_enabled: err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: +err_vlan_enabled: kfree(vg); goto out; From eba38a968258b5ad9d70722ab8c584e1753f4b16 Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Wed, 1 Mar 2017 16:25:51 +0800 Subject: [PATCH 27/69] bpf: update the comment about the length of analysis Commit 07016151a446 ("bpf, verifier: further improve search pruning") increased the limit of processed instructions from 32k to 64k, but the comment still mentioned the 32k limit. This commit updates the comment to reflect the change. Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Gary Lin Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3fc6e39b223e2c..796b68d001198a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -33,7 +33,7 @@ * - out of bounds or malformed jumps * The second pass is all possible path descent from the 1st insn. * Since it's analyzing all pathes through the program, the length of the - * analysis is limited to 32k insn, which may be hit even if total number of + * analysis is limited to 64k insn, which may be hit even if total number of * insn is less then 4K, but there are too many branches that change stack/regs. 
* Number of 'branches to be analyzed' is limited to 1k * From 449809a66c1d0b1563dee84493e14bf3104d2d7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Mar 2017 08:39:49 -0800 Subject: [PATCH 28/69] tcp/dccp: block BH for SYN processing SYN processing really was meant to be handled from BH. When I got rid of BH blocking while processing socket backlog in commit 5413d1babe8f ("net: do not block BH while processing socket backlog"), I forgot that a malicious user could transition to TCP_LISTEN from a state that allowed (SYN) packets to be parked in the socket backlog while socket is owned by the thread doing the listen() call. Sure enough syzkaller found this and reported the bug ;) ================================= [ INFO: inconsistent lock state ] 4.10.0+ #60 Not tainted --------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. syz-executor0/5090 [HC0[0]:SC0[0]:HE1:SE1] takes: (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: [] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 {IN-SOFTIRQ-W} state was registered at: mark_irqflags kernel/locking/lockdep.c:2923 [inline] __lock_acquire+0xbcf/0x3270 kernel/locking/lockdep.c:3295 lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 tcp_conn_request+0x25cc/0x3310 net/ipv4/tcp_input.c:6399 tcp_v4_conn_request+0x157/0x220 net/ipv4/tcp_ipv4.c:1262 tcp_rcv_state_process+0x802/0x4130 net/ipv4/tcp_input.c:5889 tcp_v4_do_rcv+0x56b/0x940 net/ipv4/tcp_ipv4.c:1433 tcp_v4_rcv+0x2e12/0x3210 net/ipv4/tcp_ipv4.c:1711 
ip_local_deliver_finish+0x4ce/0xc40 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:257 [inline] ip_local_deliver+0x1ce/0x710 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:492 [inline] ip_rcv_finish+0xb1d/0x2110 net/ipv4/ip_input.c:396 NF_HOOK include/linux/netfilter.h:257 [inline] ip_rcv+0xd90/0x19c0 net/ipv4/ip_input.c:487 __netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4179 __netif_receive_skb+0x2a/0x170 net/core/dev.c:4217 netif_receive_skb_internal+0x1d6/0x430 net/core/dev.c:4245 napi_skb_finish net/core/dev.c:4602 [inline] napi_gro_receive+0x4e6/0x680 net/core/dev.c:4636 e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4033 [inline] e1000_clean_rx_irq+0x5e0/0x1490 drivers/net/ethernet/intel/e1000/e1000_main.c:4489 e1000_clean+0xb9a/0x2910 drivers/net/ethernet/intel/e1000/e1000_main.c:3834 napi_poll net/core/dev.c:5171 [inline] net_rx_action+0xe70/0x1900 net/core/dev.c:5236 __do_softirq+0x2fb/0xb7d kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x19e/0x1d0 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:658 [inline] do_IRQ+0x81/0x1a0 arch/x86/kernel/irq.c:250 ret_from_intr+0x0/0x20 native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:53 arch_safe_halt arch/x86/include/asm/paravirt.h:98 [inline] default_idle+0x8f/0x410 arch/x86/kernel/process.c:271 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:262 default_idle_call+0x36/0x60 kernel/sched/idle.c:96 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x348/0x440 kernel/sched/idle.c:243 cpu_startup_entry+0x18/0x20 kernel/sched/idle.c:345 start_secondary+0x344/0x440 arch/x86/kernel/smpboot.c:272 verify_cpu+0x0/0xfc irq event stamp: 1741 hardirqs last enabled at (1741): [] __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160 [inline] hardirqs last enabled at (1741): [] _raw_spin_unlock_irqrestore+0xf7/0x1a0 kernel/locking/spinlock.c:191 hardirqs last disabled at (1740): [] __raw_spin_lock_irqsave 
include/linux/spinlock_api_smp.h:108 [inline] hardirqs last disabled at (1740): [] _raw_spin_lock_irqsave+0xa2/0x110 kernel/locking/spinlock.c:159 softirqs last enabled at (1738): [] __do_softirq+0x7cf/0xb7d kernel/softirq.c:310 softirqs last disabled at (1571): [] do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&hashinfo->ehash_locks[i])->rlock); lock(&(&hashinfo->ehash_locks[i])->rlock); *** DEADLOCK *** 1 lock held by syz-executor0/5090: #0: (sk_lock-AF_INET6){+.+.+.}, at: [] lock_sock include/net/sock.h:1460 [inline] #0: (sk_lock-AF_INET6){+.+.+.}, at: [] sock_setsockopt+0x233/0x1e40 net/core/sock.c:683 stack backtrace: CPU: 1 PID: 5090 Comm: syz-executor0 Not tainted 4.10.0+ #60 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:15 [inline] dump_stack+0x292/0x398 lib/dump_stack.c:51 print_usage_bug+0x3ef/0x450 kernel/locking/lockdep.c:2387 valid_state kernel/locking/lockdep.c:2400 [inline] mark_lock_irq kernel/locking/lockdep.c:2602 [inline] mark_lock+0xf30/0x1410 kernel/locking/lockdep.c:3065 mark_irqflags kernel/locking/lockdep.c:2941 [inline] __lock_acquire+0x6dc/0x3270 kernel/locking/lockdep.c:3295 lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 dccp_v6_conn_request+0xada/0x11b0 net/dccp/ipv6.c:380 dccp_rcv_state_process+0x51e/0x1660 net/dccp/input.c:606 dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632 sk_backlog_rcv include/net/sock.h:896 [inline] __release_sock+0x127/0x3a0 net/core/sock.c:2052 
release_sock+0xa5/0x2b0 net/core/sock.c:2539 sock_setsockopt+0x60f/0x1e40 net/core/sock.c:1016 SYSC_setsockopt net/socket.c:1782 [inline] SyS_setsockopt+0x2fb/0x3a0 net/socket.c:1765 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x4458b9 RSP: 002b:00007fe8b26c2b58 EFLAGS: 00000292 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00000000004458b9 RDX: 000000000000001a RSI: 0000000000000001 RDI: 0000000000000006 RBP: 00000000006e2110 R08: 0000000000000010 R09: 0000000000000000 R10: 00000000208c3000 R11: 0000000000000292 R12: 0000000000708000 R13: 0000000020000000 R14: 0000000000001000 R15: 0000000000000000 Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/dccp/input.c | 10 ++++++++-- net/ipv4/tcp_input.c | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index 8fedc2d497709b..4a05d78768502d 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const int old_state = sk->sk_state; + bool acceptable; int queued = 0; /* @@ -603,8 +604,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (inet_csk(sk)->icsk_af_ops->conn_request(sk, - skb) < 0) + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. 
+ */ + local_bh_disable(); + acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) return 1; consume_skb(skb); return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2c0ff327b6dfe6..39c393cc0fd3c1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5886,9 +5886,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->syn) { if (th->fin) goto discard; - if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) - return 1; + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. + */ + local_bh_disable(); + acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) + return 1; consume_skb(skb); return 0; } From 8953de2f02ad7b15e4964c82f9afd60f128e4e98 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Wed, 1 Mar 2017 09:55:28 +0000 Subject: [PATCH 29/69] net: bridge: allow IPv6 when multicast flood is disabled Even with multicast flooding turned off, IPv6 ND should still work so that IPv6 connectivity is provided. Allow this by continuing to flood multicast traffic originated by us. Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag") Cc: Nikolay Aleksandrov Signed-off-by: Mike Manning Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_forward.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6bfac29318f21e..902af6ba481c99 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) continue; + /* Do not flood if mc off, except for traffic we originate */ if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD)) + !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) continue; /* Do not flood to ports that enable proxy ARP */ From 540e2894f7905538740aaf122bd8e0548e1c34a4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 1 Mar 2017 12:57:20 +0100 Subject: [PATCH 30/69] net: don't call strlen() on the user buffer in packet_bind_spkt() KMSAN (KernelMemorySanitizer, a new error detection tool) reports use of uninitialized memory in packet_bind_spkt(): Acked-by: Eric Dumazet ================================================================== BUG: KMSAN: use of unitialized memory CPU: 0 PID: 1074 Comm: packet Not tainted 4.8.0-rc6+ #1891 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 0000000000000000 ffff88006b6dfc08 ffffffff82559ae8 ffff88006b6dfb48 ffffffff818a7c91 ffffffff85b9c870 0000000000000092 ffffffff85b9c550 0000000000000000 0000000000000092 00000000ec400911 0000000000000002 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x238/0x290 lib/dump_stack.c:51 [] kmsan_report+0x276/0x2e0 mm/kmsan/kmsan.c:1003 [] __msan_warning+0x5b/0xb0 mm/kmsan/kmsan_instr.c:424 [< inline >] strlen lib/string.c:484 [] strlcpy+0x9d/0x200 lib/string.c:144 [] packet_bind_spkt+0x144/0x230 net/packet/af_packet.c:3132 [] SYSC_bind+0x40d/0x5f0 net/socket.c:1370 [] SyS_bind+0x82/0xa0 net/socket.c:1356 [] entry_SYSCALL_64_fastpath+0x13/0x8f 
arch/x86/entry/entry_64.o:? chained origin: 00000000eba00911 [] save_stack_trace+0x27/0x50 arch/x86/kernel/stacktrace.c:67 [< inline >] kmsan_save_stack_with_flags mm/kmsan/kmsan.c:322 [< inline >] kmsan_save_stack mm/kmsan/kmsan.c:334 [] kmsan_internal_chain_origin+0x118/0x1e0 mm/kmsan/kmsan.c:527 [] __msan_set_alloca_origin4+0xc3/0x130 mm/kmsan/kmsan_instr.c:380 [] SYSC_bind+0x129/0x5f0 net/socket.c:1356 [] SyS_bind+0x82/0xa0 net/socket.c:1356 [] entry_SYSCALL_64_fastpath+0x13/0x8f arch/x86/entry/entry_64.o:? origin description: ----address@SYSC_bind (origin=00000000eb400911) ================================================================== (the line numbers are relative to 4.8-rc6, but the bug persists upstream) , when I run the following program as root: ===================================== #include #include #include #include int main() { struct sockaddr addr; memset(&addr, 0xff, sizeof(addr)); addr.sa_family = AF_PACKET; int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)); bind(fd, &addr, sizeof(addr)); return 0; } ===================================== This happens because addr.sa_data copied from the userspace is not zero-terminated, and copying it with strlcpy() in packet_bind_spkt() results in calling strlen() on the kernel copy of that non-terminated buffer. Signed-off-by: Alexander Potapenko Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2bd0d1949312c3..a0dbe7ca8f724c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3103,7 +3103,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[15]; + char name[sizeof(uaddr->sa_data) + 1]; /* * Check legality @@ -3111,7 +3111,11 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, if (addr_len != sizeof(struct sockaddr)) return -EINVAL; - strlcpy(name, uaddr->sa_data, sizeof(name)); + /* uaddr->sa_data comes from the userspace, it's not guaranteed to be + * zero-terminated. + */ + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); + name[sizeof(uaddr->sa_data)] = 0; return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } From 13baa00ad01bb3a9f893e3a08cbc2d072fc0c15d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Mar 2017 14:28:39 -0800 Subject: [PATCH 31/69] net: net_enable_timestamp() can be called from irq contexts It is now very clear that silly TCP listeners might play with enabling/disabling timestamping while new children are added to their accept queue. Meaning net_enable_timestamp() can be called from BH context while the current state of the static key is not enabled. Let's play it safe and allow all contexts. The work queue is scheduled only under the problematic cases, which are the static key enable/disable transition, to not slow down critical paths. This extends and improves what we did in commit 5fa8bbda38c6 ("net: use a work queue to defer net_disable_timestamp() work") Fixes: b90e5794c5bd ("net: dont call jump_label_dec from irq context") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S.
Miller --- net/core/dev.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index e63bf61b19be02..8637b2b71f3d47 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1698,27 +1698,54 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL static atomic_t netstamp_needed_deferred; +static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; - while (deferred--) - static_key_slow_dec(&netstamp_needed); + wanted = atomic_add_return(deferred, &netstamp_wanted); + if (wanted > 0) + static_key_enable(&netstamp_needed); + else + static_key_disable(&netstamp_needed); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { +#ifdef HAVE_JUMP_LABEL + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 0) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) + return; + } + atomic_inc(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else static_key_slow_inc(&netstamp_needed); +#endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef HAVE_JUMP_LABEL - /* net_disable_timestamp() can be called from non process context */ - atomic_inc(&netstamp_needed_deferred); + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 1) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) + return; + } + atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_key_slow_dec(&netstamp_needed); From 48cac18ecf1de82f76259a54402c3adb7839ad01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Mar 2017 14:45:06 -0800 Subject: [PATCH 32/69] ipv6: orphan skbs in reassembly unit Andrey reported a use-after-free in IPv6 stack. 
Issue here is that we free the socket while it still has skb in TX path and in some queues. It happens here because IPv6 reassembly unit messes skb->truesize, breaking skb_set_owner_w() badly. We fixed a similar issue for IPV4 in commit 8282f27449bf ("inet: frag: Always orphan skbs inside ip_defrag()") Acked-by: Joe Stringer ================================================================== BUG: KASAN: use-after-free in sock_wfree+0x118/0x120 Read of size 8 at addr ffff880062da0060 by task a.out/4140 page:ffffea00018b6800 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 flags: 0x100000000008100(slab|head) raw: 0100000000008100 0000000000000000 0000000000000000 0000000180130013 raw: dead000000000100 dead000000000200 ffff88006741f140 0000000000000000 page dumped because: kasan: bad access detected CPU: 0 PID: 4140 Comm: a.out Not tainted 4.10.0-rc3+ #59 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:15 dump_stack+0x292/0x398 lib/dump_stack.c:51 describe_address mm/kasan/report.c:262 kasan_report_error+0x121/0x560 mm/kasan/report.c:370 kasan_report mm/kasan/report.c:392 __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:413 sock_flag ./arch/x86/include/asm/bitops.h:324 sock_wfree+0x118/0x120 net/core/sock.c:1631 skb_release_head_state+0xfc/0x250 net/core/skbuff.c:655 skb_release_all+0x15/0x60 net/core/skbuff.c:668 __kfree_skb+0x15/0x20 net/core/skbuff.c:684 kfree_skb+0x16e/0x4e0 net/core/skbuff.c:705 inet_frag_destroy+0x121/0x290 net/ipv4/inet_fragment.c:304 inet_frag_put ./include/net/inet_frag.h:133 nf_ct_frag6_gather+0x1125/0x38b0 net/ipv6/netfilter/nf_conntrack_reasm.c:617 ipv6_defrag+0x21b/0x350 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68 nf_hook_entry_hookfn ./include/linux/netfilter.h:102 nf_hook_slow+0xc3/0x290 net/netfilter/core.c:310 nf_hook ./include/linux/netfilter.h:212 __ip6_local_out+0x52c/0xaf0 net/ipv6/output_core.c:160 ip6_local_out+0x2d/0x170 
net/ipv6/output_core.c:170 ip6_send_skb+0xa1/0x340 net/ipv6/ip6_output.c:1722 ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1742 rawv6_push_pending_frames net/ipv6/raw.c:613 rawv6_sendmsg+0x2cff/0x4130 net/ipv6/raw.c:927 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744 sock_sendmsg_nosec net/socket.c:635 sock_sendmsg+0xca/0x110 net/socket.c:645 sock_write_iter+0x326/0x620 net/socket.c:848 new_sync_write fs/read_write.c:499 __vfs_write+0x483/0x760 fs/read_write.c:512 vfs_write+0x187/0x530 fs/read_write.c:560 SYSC_write fs/read_write.c:607 SyS_write+0xfb/0x230 fs/read_write.c:599 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 RIP: 0033:0x7ff26e6f5b79 RSP: 002b:00007ff268e0ed98 EFLAGS: 00000206 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007ff268e0f9c0 RCX: 00007ff26e6f5b79 RDX: 0000000000000010 RSI: 0000000020f50fe1 RDI: 0000000000000003 RBP: 00007ff26ebc1220 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 R13: 00007ff268e0f9c0 R14: 00007ff26efec040 R15: 0000000000000003 The buggy address belongs to the object at ffff880062da0000 which belongs to the cache RAWv6 of size 1504 The buggy address ffff880062da0060 is located 96 bytes inside of 1504-byte region [ffff880062da0000, ffff880062da05e0) Freed by task 4113: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 save_stack+0x43/0xd0 mm/kasan/kasan.c:502 set_track mm/kasan/kasan.c:514 kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578 slab_free_hook mm/slub.c:1352 slab_free_freelist_hook mm/slub.c:1374 slab_free mm/slub.c:2951 kmem_cache_free+0xb2/0x2c0 mm/slub.c:2973 sk_prot_free net/core/sock.c:1377 __sk_destruct+0x49c/0x6e0 net/core/sock.c:1452 sk_destruct+0x47/0x80 net/core/sock.c:1460 __sk_free+0x57/0x230 net/core/sock.c:1468 sk_free+0x23/0x30 net/core/sock.c:1479 sock_put ./include/net/sock.h:1638 sk_common_release+0x31e/0x4e0 net/core/sock.c:2782 rawv6_close+0x54/0x80 net/ipv6/raw.c:1214 
inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425 inet6_release+0x50/0x70 net/ipv6/af_inet6.c:431 sock_release+0x8d/0x1e0 net/socket.c:599 sock_close+0x16/0x20 net/socket.c:1063 __fput+0x332/0x7f0 fs/file_table.c:208 ____fput+0x15/0x20 fs/file_table.c:244 task_work_run+0x19b/0x270 kernel/task_work.c:116 exit_task_work ./include/linux/task_work.h:21 do_exit+0x186b/0x2800 kernel/exit.c:839 do_group_exit+0x149/0x420 kernel/exit.c:943 SYSC_exit_group kernel/exit.c:954 SyS_exit_group+0x1d/0x20 kernel/exit.c:952 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 Allocated by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 save_stack+0x43/0xd0 mm/kasan/kasan.c:502 set_track mm/kasan/kasan.c:514 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:544 slab_post_alloc_hook mm/slab.h:432 slab_alloc_node mm/slub.c:2708 slab_alloc mm/slub.c:2716 kmem_cache_alloc+0x1af/0x250 mm/slub.c:2721 sk_prot_alloc+0x65/0x2a0 net/core/sock.c:1334 sk_alloc+0x105/0x1010 net/core/sock.c:1396 inet6_create+0x44d/0x1150 net/ipv6/af_inet6.c:183 __sock_create+0x4f6/0x880 net/socket.c:1199 sock_create net/socket.c:1239 SYSC_socket net/socket.c:1269 SyS_socket+0xf9/0x230 net/socket.c:1249 entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 Memory state around the buggy address: ffff880062d9ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff880062d9ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff880062da0000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff880062da0080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880062da0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Reported-by: Andrey Konovalov Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + net/openvswitch/conntrack.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 9948b5ce52dad3..986d4ca38832b1 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); + skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq == NULL) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 85cd5952667068..e0a87776a010a3 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -485,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, } else if (key->eth.type == htons(ETH_P_IPV6)) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - skb_orphan(skb); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); if (err) { From eb1e011a14748a1d9df9a7d7df9a5711721a1bdb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Feb 2017 09:49:26 +0100 Subject: [PATCH 33/69] average: change to declare precision, not factor Declaring the factor is counter-intuitive, and people are prone to using small(-ish) values even when that makes no sense. Change the DECLARE_EWMA() macro to take the fractional precision, in bits, rather than a factor, and update all users. While at it, add some more documentation. Acked-by: David S. 
Miller Signed-off-by: Johannes Berg --- drivers/net/virtio_net.c | 2 +- drivers/net/wireless/ath/ath5k/ath5k.h | 2 +- drivers/net/wireless/ralink/rt2x00/rt2x00.h | 2 +- include/linux/average.h | 61 ++++++++++++++------- net/batman-adv/types.h | 2 +- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/sta_info.h | 2 +- 7 files changed, 47 insertions(+), 26 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index bf95016f442ace..e9d7e2b70085ee 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -51,7 +51,7 @@ module_param(gso, bool, 0444); * at once, the weight is chosen so that the EWMA will be insensitive to short- * term, transient changes in packet size. */ -DECLARE_EWMA(pkt_len, 1, 64) +DECLARE_EWMA(pkt_len, 0, 64) /* With mergeable buffers we align buffer address and use the low bits to * encode its true size. Buffer size is up to 1 page so we need to align to diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h index 67fedb61fcc02d..979800c6f57fba 100644 --- a/drivers/net/wireless/ath/ath5k/ath5k.h +++ b/drivers/net/wireless/ath/ath5k/ath5k.h @@ -1252,7 +1252,7 @@ struct ath5k_statistics { #define ATH5K_TXQ_LEN_MAX (ATH_TXBUF / 4) /* bufs per queue */ #define ATH5K_TXQ_LEN_LOW (ATH5K_TXQ_LEN_MAX / 2) /* low mark */ -DECLARE_EWMA(beacon_rssi, 1024, 8) +DECLARE_EWMA(beacon_rssi, 10, 8) /* Driver state associated with an instance of a device */ struct ath5k_hw { diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h index 26869b3bef45ff..340787894c694a 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h @@ -257,7 +257,7 @@ struct link_qual { int tx_failed; }; -DECLARE_EWMA(rssi, 1024, 8) +DECLARE_EWMA(rssi, 10, 8) /* * Antenna settings about the currently active link. 
diff --git a/include/linux/average.h b/include/linux/average.h index d04aa58280ded5..7ddaf340d2ac98 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -1,45 +1,66 @@ #ifndef _LINUX_AVERAGE_H #define _LINUX_AVERAGE_H -/* Exponentially weighted moving average (EWMA) */ +/* + * Exponentially weighted moving average (EWMA) + * + * This implements a fixed-precision EWMA algorithm, with both the + * precision and fall-off coefficient determined at compile-time + * and built into the generated helper funtions. + * + * The first argument to the macro is the name that will be used + * for the struct and helper functions. + * + * The second argument, the precision, expresses how many bits are + * used for the fractional part of the fixed-precision values. + * + * The third argument, the weight reciprocal, determines how the + * new values will be weighed vs. the old state, new values will + * get weight 1/weight_rcp and old values 1-1/weight_rcp. Note + * that this parameter must be a power of two for efficiency. + */ -#define DECLARE_EWMA(name, _factor, _weight) \ +#define DECLARE_EWMA(name, _precision, _weight_rcp) \ struct ewma_##name { \ unsigned long internal; \ }; \ static inline void ewma_##name##_init(struct ewma_##name *e) \ { \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + /* \ + * Even if you want to feed it just 0/1 you should have \ + * some bits for the non-fractional part... 
\ + */ \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ e->internal = 0; \ } \ static inline unsigned long \ ewma_##name##_read(struct ewma_##name *e) \ { \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ - return e->internal >> ilog2(_factor); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ + return e->internal >> (_precision); \ } \ static inline void ewma_##name##_add(struct ewma_##name *e, \ unsigned long val) \ { \ unsigned long internal = ACCESS_ONCE(e->internal); \ - unsigned long weight = ilog2(_weight); \ - unsigned long factor = ilog2(_factor); \ + unsigned long weight_rcp = ilog2(_weight_rcp); \ + unsigned long precision = _precision; \ \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ \ ACCESS_ONCE(e->internal) = internal ? \ - (((internal << weight) - internal) + \ - (val << factor)) >> weight : \ - (val << factor); \ + (((internal << weight_rcp) - internal) + \ + (val << precision)) >> weight_rcp : \ + (val << precision); \ } #endif /* _LINUX_AVERAGE_H */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 8f64a5c013454a..66b25e410a4137 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -402,7 +402,7 @@ struct batadv_gw_node { struct rcu_head rcu; }; -DECLARE_EWMA(throughput, 1024, 8) +DECLARE_EWMA(throughput, 10, 8) /** * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. 
V private neighbor diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 159a1a73372506..0e718437d080e7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -428,7 +428,7 @@ struct ieee80211_sta_tx_tspec { bool downgraded; }; -DECLARE_EWMA(beacon_signal, 16, 4) +DECLARE_EWMA(beacon_signal, 4, 4) struct ieee80211_if_managed { struct timer_list timer; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 15599c70a38fc9..e65cda34d2bc00 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -372,7 +372,7 @@ struct mesh_sta { unsigned int fail_avg; }; -DECLARE_EWMA(signal, 1024, 8) +DECLARE_EWMA(signal, 10, 8) struct ieee80211_sta_rx_stats { unsigned long packets; From 1657b8f84ed9fc1d2a100671f1d42d6286f20073 Mon Sep 17 00:00:00 2001 From: Waldemar Rymarkiewicz Date: Fri, 24 Feb 2017 23:30:03 +0100 Subject: [PATCH 34/69] ath10k: search SMBIOS for OEM board file extension Board Data File (BDF) is loaded during the driver boot-up procedure. The right board data file is identified, among other things, by the device and subsystem ids. The problem, however, can occur when the (default) board data file cannot fulfill the vendor requirements and it is necessary to use a different board data file. To solve the issue QCA uses SMBIOS type 0xF8 to store Board Data File Name Extension to specify the extension/variant name. The driver will take the extension suffix into consideration and will load the right (non-default) board data file if necessary. If it is unnecessary to use an extension board data file, please leave the SMBIOS field blank and the default configuration will be used.
Example: If a default board data file for a specific board is identified by a string "bus=pci,vendor=168c,device=003e,subsystem-vendor=1028, subsystem-device=0310" then the OEM specific data file, if used, could be identified by variant suffix: "bus=pci,vendor=168c,device=003e,subsystem-vendor=1028, subsystem-device=0310,variant=DE_1AB" If board data file name extension is set but board-2.bin does not contain board data file for the variant, the driver will fallback to the default board data file not to break backward compatibility. This was first applied in commit f2593cb1b291 ("ath10k: Search SMBIOS for OEM board file extension") but later reverted in commit 005c3490e9db ("Revert "ath10k: Search SMBIOS for OEM board file extension"". This patch is now otherwise the same as commit f2593cb1b291 except the regression fixed. Signed-off-by: Waldemar Rymarkiewicz Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/core.c | 101 ++++++++++++++++++++++++- drivers/net/wireless/ath/ath10k/core.h | 19 +++++ 2 files changed, 117 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index dd902b43f8f775..0a8e29e9a0ebc7 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include "core.h" @@ -711,6 +713,72 @@ static int ath10k_core_get_board_id_from_otp(struct ath10k *ar) return 0; } +static void ath10k_core_check_bdfext(const struct dmi_header *hdr, void *data) +{ + struct ath10k *ar = data; + const char *bdf_ext; + const char *magic = ATH10K_SMBIOS_BDF_EXT_MAGIC; + u8 bdf_enabled; + int i; + + if (hdr->type != ATH10K_SMBIOS_BDF_EXT_TYPE) + return; + + if (hdr->length != ATH10K_SMBIOS_BDF_EXT_LENGTH) { + ath10k_dbg(ar, ATH10K_DBG_BOOT, + "wrong smbios bdf ext type length (%d).\n", + hdr->length); + return; + } + + bdf_enabled = *((u8 *)hdr + ATH10K_SMBIOS_BDF_EXT_OFFSET); + if 
(!bdf_enabled) { + ath10k_dbg(ar, ATH10K_DBG_BOOT, "bdf variant name not found.\n"); + return; + } + + /* Only one string exists (per spec) */ + bdf_ext = (char *)hdr + hdr->length; + + if (memcmp(bdf_ext, magic, strlen(magic)) != 0) { + ath10k_dbg(ar, ATH10K_DBG_BOOT, + "bdf variant magic does not match.\n"); + return; + } + + for (i = 0; i < strlen(bdf_ext); i++) { + if (!isascii(bdf_ext[i]) || !isprint(bdf_ext[i])) { + ath10k_dbg(ar, ATH10K_DBG_BOOT, + "bdf variant name contains non ascii chars.\n"); + return; + } + } + + /* Copy extension name without magic suffix */ + if (strscpy(ar->id.bdf_ext, bdf_ext + strlen(magic), + sizeof(ar->id.bdf_ext)) < 0) { + ath10k_dbg(ar, ATH10K_DBG_BOOT, + "bdf variant string is longer than the buffer can accommodate (variant: %s)\n", + bdf_ext); + return; + } + + ath10k_dbg(ar, ATH10K_DBG_BOOT, + "found and validated bdf variant smbios_type 0x%x bdf %s\n", + ATH10K_SMBIOS_BDF_EXT_TYPE, bdf_ext); +} + +static int ath10k_core_check_smbios(struct ath10k *ar) +{ + ar->id.bdf_ext[0] = '\0'; + dmi_walk(ath10k_core_check_bdfext, ar); + + if (ar->id.bdf_ext[0] == '\0') + return -ENODATA; + + return 0; +} + static int ath10k_download_and_run_otp(struct ath10k *ar) { u32 result, address = ar->hw_params.patch_load_addr; @@ -1020,6 +1088,23 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar, case ATH10K_BD_IE_BOARD: ret = ath10k_core_parse_bd_ie_board(ar, data, ie_len, boardname); + if (ret == -ENOENT && ar->id.bdf_ext[0] != '\0') { + /* try default bdf if variant was not found */ + char *s, *v = ",variant="; + char boardname2[100]; + + strlcpy(boardname2, boardname, + sizeof(boardname2)); + + s = strstr(boardname2, v); + if (s) + *s = '\0'; /* strip ",variant=%s" */ + + ret = ath10k_core_parse_bd_ie_board(ar, data, + ie_len, + boardname2); + } + if (ret == -ENOENT) /* no match found, continue */ break; @@ -1057,6 +1142,9 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar, static int 
ath10k_core_create_board_name(struct ath10k *ar, char *name, size_t name_len) { + /* strlen(',variant=') + strlen(ar->id.bdf_ext) */ + char variant[9 + ATH10K_SMBIOS_BDF_EXT_STR_LENGTH] = { 0 }; + if (ar->id.bmi_ids_valid) { scnprintf(name, name_len, "bus=%s,bmi-chip-id=%d,bmi-board-id=%d", @@ -1066,12 +1154,15 @@ static int ath10k_core_create_board_name(struct ath10k *ar, char *name, goto out; } + if (ar->id.bdf_ext[0] != '\0') + scnprintf(variant, sizeof(variant), ",variant=%s", + ar->id.bdf_ext); + scnprintf(name, name_len, - "bus=%s,vendor=%04x,device=%04x,subsystem-vendor=%04x,subsystem-device=%04x", + "bus=%s,vendor=%04x,device=%04x,subsystem-vendor=%04x,subsystem-device=%04x%s", ath10k_bus_str(ar->hif.bus), ar->id.vendor, ar->id.device, - ar->id.subsystem_vendor, ar->id.subsystem_device); - + ar->id.subsystem_vendor, ar->id.subsystem_device, variant); out: ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot using board name '%s'\n", name); @@ -2128,6 +2219,10 @@ static int ath10k_core_probe_fw(struct ath10k *ar) goto err_free_firmware_files; } + ret = ath10k_core_check_smbios(ar); + if (ret) + ath10k_dbg(ar, ATH10K_DBG_BOOT, "bdf variant name not set.\n"); + ret = ath10k_core_fetch_board_file(ar); if (ret) { ath10k_err(ar, "failed to fetch board file: %d\n", ret); diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index 757242ef52ac14..88d14be7fcceb4 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -69,6 +69,23 @@ #define ATH10K_NAPI_BUDGET 64 #define ATH10K_NAPI_QUOTA_LIMIT 60 +/* SMBIOS type containing Board Data File Name Extension */ +#define ATH10K_SMBIOS_BDF_EXT_TYPE 0xF8 + +/* SMBIOS type structure length (excluding strings-set) */ +#define ATH10K_SMBIOS_BDF_EXT_LENGTH 0x9 + +/* Offset pointing to Board Data File Name Extension */ +#define ATH10K_SMBIOS_BDF_EXT_OFFSET 0x8 + +/* Board Data File Name Extension string length. 
+ * String format: BDF__\0 + */ +#define ATH10K_SMBIOS_BDF_EXT_STR_LENGTH 0x20 + +/* The magic used by QCA spec */ +#define ATH10K_SMBIOS_BDF_EXT_MAGIC "BDF_" + struct ath10k; enum ath10k_bus { @@ -798,6 +815,8 @@ struct ath10k { bool bmi_ids_valid; u8 bmi_board_id; u8 bmi_chip_id; + + char bdf_ext[ATH10K_SMBIOS_BDF_EXT_STR_LENGTH]; } id; int fw_api; From e3330039ea28dc199e3b2da993895ff742a91adf Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 27 Feb 2017 16:07:43 -0800 Subject: [PATCH 35/69] ipv6: check for ip6_null_entry in __ip6_del_rt_siblings() Andrey reported a NULL pointer deref bug in ipv6_route_ioctl() -> ip6_route_del() -> __ip6_del_rt_siblings() code path. This is because ip6_null_entry is returned in this path since ip6_null_entry is kinda default for a ipv6 route table root node. Quote from David Ahern: ip6_null_entry is the root of all ipv6 fib tables making it integrated into the table ... We should ignore any attempt of trying to delete it, like we do in __ip6_del_rt() path and several others. Reported-by: Andrey Konovalov Fixes: 0ae8133586ad ("net: ipv6: Allow shorthand delete of all nexthops in multipath route") Cc: David Ahern Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d94f1dfa54c842..43ca90d50ae9b3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2169,10 +2169,13 @@ int ip6_del_rt(struct rt6_info *rt) static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; + struct net *net = info->nl_net; struct sk_buff *skb = NULL; struct fib6_table *table; - int err; + int err = -ENOENT; + if (rt == net->ipv6.ip6_null_entry) + goto out_put; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); @@ -2184,7 +2187,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; - if (rt6_fill_node(info->nl_net, skb, rt, + if (rt6_fill_node(net, skb, rt, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); @@ -2198,17 +2201,18 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) rt6i_siblings) { err = fib6_del(sibling, info); if (err) - goto out; + goto out_unlock; } } err = fib6_del(rt, info); -out: +out_unlock: write_unlock_bh(&table->tb6_lock); +out_put: ip6_rt_put(rt); if (skb) { - rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV6_ROUTE, + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); } return err; From 9aea7779b764a11e357d3c74af6aee3cf90f2045 Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Tue, 28 Feb 2017 18:08:55 +0100 Subject: [PATCH 36/69] drivers: net: xgene: Fix crash on DT systems On DT systems the driver requires a clock, but the probe just prints a warning and continues, leading to a crash when resetting the device. To fix this crash and properly handle probe deferrals only ignore the missing clock if DT isn't used or if the clock doesn't exist. Signed-off-by: Alban Bedel Acked-by: Iyappan Subramanian Signed-off-by: David S. 
Miller --- drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index e536301acfdec9..b3568c453b1451 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -1749,6 +1749,12 @@ static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata) pdata->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(pdata->clk)) { + /* Abort if the clock is defined but couldn't be retrived. + * Always abort if the clock is missing on DT system as + * the driver can't cope with this case. + */ + if (PTR_ERR(pdata->clk) != -ENOENT || dev->of_node) + return PTR_ERR(pdata->clk); /* Firmware may have set up the clock already. */ dev_info(dev, "clocks have been setup already\n"); } From 402168b4c2dc0734b8fbd282eff77da0275c5129 Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Tue, 28 Feb 2017 15:02:51 -0600 Subject: [PATCH 37/69] amd-xgbe: Stop the PHY before releasing interrupts Some configurations require the use of the hardware's MDIO support to communicate with external PHYs. The MDIO commands indicate completion through the device interrupt. When bringing down the device the interrupts were released before stopping the external PHY, resulting in MDIO command timeouts. Move the stopping of the PHY to before the releasing of the interrupts. Signed-off-by: Tom Lendacky Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 3aa457c8ca21d3..248f60d171a5a0 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1131,12 +1131,12 @@ static void xgbe_stop(struct xgbe_prv_data *pdata) hw_if->disable_tx(pdata); hw_if->disable_rx(pdata); + phy_if->phy_stop(pdata); + xgbe_free_irqs(pdata); xgbe_napi_disable(pdata, 1); - phy_if->phy_stop(pdata); - hw_if->exit(pdata); channel = pdata->channel; From b42c6761fd1651f564491b53016046c9ebf0b2a9 Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Tue, 28 Feb 2017 15:03:01 -0600 Subject: [PATCH 38/69] amd-xgbe: Be sure to set MDIO modes on device (re)start The MDIO register mode is set when the device is probed. But when the device is brought down and then back up, the MDIO register mode has been reset. Be sure to reset the mode during device startup and only change the mode of the address specified. Signed-off-by: Tom Lendacky Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-dev.c | 2 +- drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 22 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c index a7d16db5c4b21d..937f37a5dcb2cd 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c @@ -1323,7 +1323,7 @@ static int xgbe_read_ext_mii_regs(struct xgbe_prv_data *pdata, int addr, static int xgbe_set_ext_mii_mode(struct xgbe_prv_data *pdata, unsigned int port, enum xgbe_mdio_mode mode) { - unsigned int reg_val = 0; + unsigned int reg_val = XGMAC_IOREAD(pdata, MAC_MDIOCL22R); switch (mode) { case XGBE_MDIO_MODE_CL22: diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 9d8c953083b4ef..04804cbb7dc1cd 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -875,6 +875,16 @@ static int xgbe_phy_find_phy_device(struct xgbe_prv_data *pdata) !phy_data->sfp_phy_avail) return 0; + /* Set the proper MDIO mode for the PHY */ + ret = pdata->hw_if.set_ext_mii_mode(pdata, phy_data->mdio_addr, + phy_data->phydev_mode); + if (ret) { + netdev_err(pdata->netdev, + "mdio port/clause not compatible (%u/%u)\n", + phy_data->mdio_addr, phy_data->phydev_mode); + return ret; + } + /* Create and connect to the PHY device */ phydev = get_phy_device(phy_data->mii, phy_data->mdio_addr, (phy_data->phydev_mode == XGBE_MDIO_MODE_CL45)); @@ -2722,6 +2732,18 @@ static int xgbe_phy_start(struct xgbe_prv_data *pdata) if (ret) return ret; + /* Set the proper MDIO mode for the re-driver */ + if (phy_data->redrv && !phy_data->redrv_if) { + ret = pdata->hw_if.set_ext_mii_mode(pdata, phy_data->redrv_addr, + XGBE_MDIO_MODE_CL22); + if (ret) { + netdev_err(pdata->netdev, + "redriver mdio port not compatible (%u)\n", + phy_data->redrv_addr); + return ret; + } + } + /* Start in 
highest supported mode */ xgbe_phy_set_mode(pdata, phy_data->start_mode); From 2697ea5a859b83ca49511dcfd98daf42584eb3cf Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Tue, 28 Feb 2017 15:03:10 -0600 Subject: [PATCH 39/69] amd-xgbe: Don't overwrite SFP PHY mod_absent settings If an SFP module is not present, xgbe_phy_sfp_phy_settings() should return after applying the default settings. Currently there is no return statement and the default settings are overwritten. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 04804cbb7dc1cd..e707c49cc55a78 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -716,6 +716,8 @@ static void xgbe_phy_sfp_phy_settings(struct xgbe_prv_data *pdata) pdata->phy.duplex = DUPLEX_UNKNOWN; pdata->phy.autoneg = AUTONEG_ENABLE; pdata->phy.advertising = pdata->phy.supported; + + return; } pdata->phy.advertising &= ~ADVERTISED_Autoneg; From 11bd44f62d86115796109b0349e6e191bc99b45a Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Wed, 1 Mar 2017 11:18:53 +0530 Subject: [PATCH 40/69] cxgb4: update latest firmware version supported Change t4fw_version.h to update latest firmware version number to 1.16.33.0. Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h index 5fdaa16426c50e..fa376444e57c56 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h @@ -37,7 +37,7 @@ #define T4FW_VERSION_MAJOR 0x01 #define T4FW_VERSION_MINOR 0x10 -#define T4FW_VERSION_MICRO 0x1A +#define T4FW_VERSION_MICRO 0x21 #define T4FW_VERSION_BUILD 0x00 #define T4FW_MIN_VERSION_MAJOR 0x01 @@ -46,7 +46,7 @@ #define T5FW_VERSION_MAJOR 0x01 #define T5FW_VERSION_MINOR 0x10 -#define T5FW_VERSION_MICRO 0x1A +#define T5FW_VERSION_MICRO 0x21 #define T5FW_VERSION_BUILD 0x00 #define T5FW_MIN_VERSION_MAJOR 0x00 @@ -55,7 +55,7 @@ #define T6FW_VERSION_MAJOR 0x01 #define T6FW_VERSION_MINOR 0x10 -#define T6FW_VERSION_MICRO 0x1A +#define T6FW_VERSION_MICRO 0x21 #define T6FW_VERSION_BUILD 0x00 #define T6FW_MIN_VERSION_MAJOR 0x00 From f1304f7ba3981e71dcf2ac7db92949eeab49b1bf Mon Sep 17 00:00:00 2001 From: Peter Downs Date: Wed, 1 Mar 2017 01:01:17 -0800 Subject: [PATCH 41/69] openvswitch: actions: fixed a brace coding style warning Fixed a brace coding style warning reported by checkpatch.pl Signed-off-by: Peter Downs Signed-off-by: David S. 
Miller --- net/openvswitch/actions.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b1beb2b94ec76c..c82301ce3fffb6 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -796,9 +796,8 @@ static void ovs_fragment(struct net *net, struct vport *vport, unsigned long orig_dst; struct rt6_info ovs_rt; - if (!v6ops) { + if (!v6ops) goto err; - } prepare_frag(vport, skb, orig_network_offset, ovs_key_mac_proto(key)); From d5afb6f9b6bb2c57bd0c05e76e12489dc0d037d9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 1 Mar 2017 16:35:07 -0300 Subject: [PATCH 42/69] dccp: Unlock sock before calling sk_free() The code where sk_clone() came from created a new socket and locked it, but then, on the error path didn't unlock it. This problem stayed there for a long while, till b0691c8ee7c2 ("net: Unlock sock before calling sk_free()") fixed it, but unfortunately the callers of sk_clone() (now sk_clone_lock()) were not audited and the one in dccp_create_openreq_child() remained. Now in the age of the syzkaller fuzzer, this was finally uncovered, as reported by Dmitry: ---- 8< ---- I've got the following report while running syzkaller fuzzer on 86292b33d4b7 ("Merge branch 'akpm' (patches from Andrew)") [ BUG: held lock freed! ] 4.10.0+ #234 Not tainted ------------------------- syz-executor6/6898 is freeing memory ffff88006286cac0-ffff88006286d3b7, with a lock still held there! 
(slock-AF_INET6){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (slock-AF_INET6){+.-...}, at: [] sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504 5 locks held by syz-executor6/6898: #0: (sk_lock-AF_INET6){+.+.+.}, at: [] lock_sock include/net/sock.h:1460 [inline] #0: (sk_lock-AF_INET6){+.+.+.}, at: [] inet_stream_connect+0x44/0xa0 net/ipv4/af_inet.c:681 #1: (rcu_read_lock){......}, at: [] inet6_csk_xmit+0x12a/0x5d0 net/ipv6/inet6_connection_sock.c:126 #2: (rcu_read_lock){......}, at: [] __skb_unlink include/linux/skbuff.h:1767 [inline] #2: (rcu_read_lock){......}, at: [] __skb_dequeue include/linux/skbuff.h:1783 [inline] #2: (rcu_read_lock){......}, at: [] process_backlog+0x264/0x730 net/core/dev.c:4835 #3: (rcu_read_lock){......}, at: [] ip6_input_finish+0x0/0x1700 net/ipv6/ip6_input.c:59 #4: (slock-AF_INET6){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] #4: (slock-AF_INET6){+.-...}, at: [] sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504 Fix it just like was done by b0691c8ee7c2 ("net: Unlock sock before calling sk_free()"). Reported-by: Dmitry Vyukov Cc: Cong Wang Cc: Eric Dumazet Cc: Gerrit Renker Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170301153510.GE15145@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/dccp/minisocks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 53eddf99e4f6eb..d20d948a98ed3c 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -122,6 +122,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); sk_free(newsk); return NULL; } From 94352d45092c23874532221b4d1e4721df9d63df Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 1 Mar 2017 16:35:08 -0300 Subject: [PATCH 43/69] net: Introduce sk_clone_lock() error path routine When handling problems in cloning a socket with the sk_clone_lock() function we need to perform several steps that were open coded in it and its callers, so introduce a routine to avoid this duplication: sk_free_unlock_clone(). Cc: Cong Wang Cc: Dmitry Vyukov Cc: Eric Dumazet Cc: Gerrit Renker Cc: Thomas Gleixner Link: http://lkml.kernel.org/n/net-ui6laqkotycunhtmqryl9bfx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- include/net/sock.h | 1 + net/core/sock.c | 16 +++++++++++----- net/dccp/minisocks.c | 6 +----- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 9ccefa5c548786..5e5997654db645 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1526,6 +1526,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, void sk_free(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); +void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); diff --git a/net/core/sock.c b/net/core/sock.c index e7d74940e8637d..f6fd79f33097f3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1539,11 +1539,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) is_charged = sk_filter_charge(newsk, filter); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); + sk_free_unlock_clone(newsk); newsk = NULL; goto out; } @@ -1592,6 +1588,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) } EXPORT_SYMBOL_GPL(sk_clone_lock); +void sk_free_unlock_clone(struct sock *sk) +{ + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sk->sk_destruct = NULL; + bh_unlock_sock(sk); + sk_free(sk); +} +EXPORT_SYMBOL_GPL(sk_free_unlock_clone); + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index d20d948a98ed3c..e267e6f4c9a556 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -119,11 +119,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, * Activate features: initialise CCIDs, sequence windows etc. 
*/ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); + sk_free_unlock_clone(newsk); return NULL; } dccp_init_xmit_timers(newsk); From 7db92362d2fee5887f6b0c41653b8c9f8f5d6020 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 1 Mar 2017 13:29:48 -0800 Subject: [PATCH 44/69] tcp: fix potential double free issue for fastopen_req tp->fastopen_req could potentially be double freed if a malicious user does the following: 1. Enable TCP_FASTOPEN_CONNECT sockopt and do a connect() on the socket. 2. Call connect() with AF_UNSPEC to disconnect the socket. 3. Make this socket a listening socket by calling listen(). 4. Accept incoming connections and generate child sockets. All child sockets will get a copy of the pointer of fastopen_req. 5. Call close() on all sockets. fastopen_req will get freed multiple times. Fixes: 19f6d3f3c842 ("net/tcp-fastopen: Add new API support") Reported-by: Andrey Konovalov Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index da385ae997a3d6..cf4555581282c6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1110,9 +1110,14 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, flags = (msg->msg_flags & MSG_DONTWAIT) ? 
O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, flags, 1); - inet->defer_connect = 0; - *copied = tp->fastopen_req->copied; - tcp_free_fastopen_req(tp); + /* fastopen_req could already be freed in __inet_stream_connect + * if the connection times out or gets rst + */ + if (tp->fastopen_req) { + *copied = tp->fastopen_req->copied; + tcp_free_fastopen_req(tp); + inet->defer_connect = 0; + } return err; } @@ -2318,6 +2323,10 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); + /* Clean up fastopen related fields */ + tcp_free_fastopen_req(tp); + inet->defer_connect = 0; + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk->sk_error_report(sk); From be12502e2e64854dbe0a2ddff6d26ec1143d6890 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 1 Mar 2017 17:24:47 -0800 Subject: [PATCH 45/69] drivers: net: ethernet: remove incorrect __exit markups Even if bus is not hot-pluggable, devices can be unbound from the driver via sysfs, so we should not be using __exit annotations on remove() methods. The only exception is drivers registered with platform_driver_probe() which specifically disables sysfs bind/unbind attributes. Signed-off-by: Dmitry Torokhov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/declance.c | 30 +++++++++++----------- drivers/net/ethernet/broadcom/sb1250-mac.c | 4 +-- drivers/net/ethernet/faraday/ftgmac100.c | 4 +-- drivers/net/ethernet/faraday/ftmac100.c | 4 +-- drivers/net/ethernet/seeq/sgiseeq.c | 4 +-- drivers/net/ethernet/sgi/meth.c | 4 +-- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index 76e5fc7adff519..6c98901f1b8970 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -1276,18 +1276,6 @@ static int dec_lance_probe(struct device *bdev, const int type) return ret; } -static void __exit dec_lance_remove(struct device *bdev) -{ - struct net_device *dev = dev_get_drvdata(bdev); - resource_size_t start, len; - - unregister_netdev(dev); - start = to_tc_dev(bdev)->resource.start; - len = to_tc_dev(bdev)->resource.end - start + 1; - release_mem_region(start, len); - free_netdev(dev); -} - /* Find all the lance cards on the system and initialize them */ static int __init dec_lance_platform_probe(void) { @@ -1320,7 +1308,7 @@ static void __exit dec_lance_platform_remove(void) #ifdef CONFIG_TC static int dec_lance_tc_probe(struct device *dev); -static int __exit dec_lance_tc_remove(struct device *dev); +static int dec_lance_tc_remove(struct device *dev); static const struct tc_device_id dec_lance_tc_table[] = { { "DEC ", "PMAD-AA " }, @@ -1334,7 +1322,7 @@ static struct tc_driver dec_lance_tc_driver = { .name = "declance", .bus = &tc_bus_type, .probe = dec_lance_tc_probe, - .remove = __exit_p(dec_lance_tc_remove), + .remove = dec_lance_tc_remove, }, }; @@ -1346,7 +1334,19 @@ static int dec_lance_tc_probe(struct device *dev) return status; } -static int __exit dec_lance_tc_remove(struct device *dev) +static void dec_lance_remove(struct device *bdev) +{ + struct net_device *dev = dev_get_drvdata(bdev); + resource_size_t start, len; + + unregister_netdev(dev); + start = 
to_tc_dev(bdev)->resource.start; + len = to_tc_dev(bdev)->resource.end - start + 1; + release_mem_region(start, len); + free_netdev(dev); +} + +static int dec_lance_tc_remove(struct device *dev) { put_device(dev); dec_lance_remove(dev); diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c index 89d4feba1a9aea..55c8e25b43d9ad 100644 --- a/drivers/net/ethernet/broadcom/sb1250-mac.c +++ b/drivers/net/ethernet/broadcom/sb1250-mac.c @@ -2617,7 +2617,7 @@ static int sbmac_probe(struct platform_device *pldev) return err; } -static int __exit sbmac_remove(struct platform_device *pldev) +static int sbmac_remove(struct platform_device *pldev) { struct net_device *dev = platform_get_drvdata(pldev); struct sbmac_softc *sc = netdev_priv(dev); @@ -2634,7 +2634,7 @@ static int __exit sbmac_remove(struct platform_device *pldev) static struct platform_driver sbmac_driver = { .probe = sbmac_probe, - .remove = __exit_p(sbmac_remove), + .remove = sbmac_remove, .driver = { .name = sbmac_string, }, diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 262587240c86e5..928b0df2b8e033 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1456,7 +1456,7 @@ static int ftgmac100_probe(struct platform_device *pdev) return err; } -static int __exit ftgmac100_remove(struct platform_device *pdev) +static int ftgmac100_remove(struct platform_device *pdev) { struct net_device *netdev; struct ftgmac100 *priv; @@ -1483,7 +1483,7 @@ MODULE_DEVICE_TABLE(of, ftgmac100_of_match); static struct platform_driver ftgmac100_driver = { .probe = ftgmac100_probe, - .remove = __exit_p(ftgmac100_remove), + .remove = ftgmac100_remove, .driver = { .name = DRV_NAME, .of_match_table = ftgmac100_of_match, diff --git a/drivers/net/ethernet/faraday/ftmac100.c b/drivers/net/ethernet/faraday/ftmac100.c index c0ddbbe6c22689..6ac336b546e6c2 100644 --- 
a/drivers/net/ethernet/faraday/ftmac100.c +++ b/drivers/net/ethernet/faraday/ftmac100.c @@ -1156,7 +1156,7 @@ static int ftmac100_probe(struct platform_device *pdev) return err; } -static int __exit ftmac100_remove(struct platform_device *pdev) +static int ftmac100_remove(struct platform_device *pdev) { struct net_device *netdev; struct ftmac100 *priv; @@ -1176,7 +1176,7 @@ static int __exit ftmac100_remove(struct platform_device *pdev) static struct platform_driver ftmac100_driver = { .probe = ftmac100_probe, - .remove = __exit_p(ftmac100_remove), + .remove = ftmac100_remove, .driver = { .name = DRV_NAME, }, diff --git a/drivers/net/ethernet/seeq/sgiseeq.c b/drivers/net/ethernet/seeq/sgiseeq.c index ed34196028b8e8..70347720fdf98a 100644 --- a/drivers/net/ethernet/seeq/sgiseeq.c +++ b/drivers/net/ethernet/seeq/sgiseeq.c @@ -807,7 +807,7 @@ static int sgiseeq_probe(struct platform_device *pdev) return err; } -static int __exit sgiseeq_remove(struct platform_device *pdev) +static int sgiseeq_remove(struct platform_device *pdev) { struct net_device *dev = platform_get_drvdata(pdev); struct sgiseeq_private *sp = netdev_priv(dev); @@ -822,7 +822,7 @@ static int __exit sgiseeq_remove(struct platform_device *pdev) static struct platform_driver sgiseeq_driver = { .probe = sgiseeq_probe, - .remove = __exit_p(sgiseeq_remove), + .remove = sgiseeq_remove, .driver = { .name = "sgiseeq", } diff --git a/drivers/net/ethernet/sgi/meth.c b/drivers/net/ethernet/sgi/meth.c index 69d2d30e5ef13b..ea55abd62ec709 100644 --- a/drivers/net/ethernet/sgi/meth.c +++ b/drivers/net/ethernet/sgi/meth.c @@ -854,7 +854,7 @@ static int meth_probe(struct platform_device *pdev) return 0; } -static int __exit meth_remove(struct platform_device *pdev) +static int meth_remove(struct platform_device *pdev) { struct net_device *dev = platform_get_drvdata(pdev); @@ -866,7 +866,7 @@ static int __exit meth_remove(struct platform_device *pdev) static struct platform_driver meth_driver = { .probe = meth_probe, 
- .remove = __exit_p(meth_remove), + .remove = meth_remove, .driver = { .name = "meth", } From 9d6acb3bc9058d1fd7a5297d71f14213679bb4bd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 1 Mar 2017 20:48:39 -0800 Subject: [PATCH 46/69] ipv6: ignore null_entry in inet6_rtm_getroute() too Like commit 1f17e2f2c8a8 ("net: ipv6: ignore null_entry on route dumps"), we need to ignore null entry in inet6_rtm_getroute() too. Return -ENETUNREACH here to sync with IPv4 behavior, as suggested by David. Fixes: a1a22c1206 ("net: ipv6: Keep nexthop of multipath route on admin down") Reported-by: Dmitry Vyukov Cc: David Ahern Signed-off-by: Cong Wang Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 43ca90d50ae9b3..229bfcc451ef50 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3632,6 +3632,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } + if (rt == net->ipv6.ip6_null_entry) { + err = rt->dst.error; + ip6_rt_put(rt); + goto errout; + } + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); From 152669bd3cd2407d6f556009b95ee249c0c1a462 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 2 Mar 2017 13:00:53 +0000 Subject: [PATCH 47/69] netvsc: fix use-after-free in netvsc_change_mtu() 'nvdev' is freed in rndis_filter_device_remove -> netvsc_device_remove -> free_netvsc_device, so we mustn't access it, before it's re-created in rndis_filter_device_add -> netvsc_device_add. Signed-off-by: Dexuan Cui Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Reviewed-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/hyperv/netvsc_drv.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 2d3cdb026a9959..bc05c895d9589d 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -859,15 +859,22 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu) if (ret) goto out; + memset(&device_info, 0, sizeof(device_info)); + device_info.ring_size = ring_size; + device_info.num_chn = nvdev->num_chn; + device_info.max_num_vrss_chns = nvdev->num_chn; + ndevctx->start_remove = true; rndis_filter_device_remove(hdev, nvdev); + /* 'nvdev' has been freed in rndis_filter_device_remove() -> + * netvsc_device_remove () -> free_netvsc_device(). + * We mustn't access it before it's re-created in + * rndis_filter_device_add() -> netvsc_device_add(). + */ + ndev->mtu = mtu; - memset(&device_info, 0, sizeof(device_info)); - device_info.ring_size = ring_size; - device_info.num_chn = nvdev->num_chn; - device_info.max_num_vrss_chns = nvdev->num_chn; rndis_filter_device_add(hdev, &device_info); out: From 31c05415f5b471fd333fe42629788364faea8e0d Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 2 Mar 2017 12:24:36 -0800 Subject: [PATCH 48/69] bonding: use ETH_MAX_MTU as max mtu This restores the ability of setting bond device's mtu to 9000. Fixes: 91572088e3fd ("net: use core MTU range checking in core net infra") Reported-by: daznis@gmail.com Reported-by: Brad Campbell Cc: Jarod Wilson Signed-off-by: Cong Wang Signed-off-by: Jay Vosburgh Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6321f12630c8c5..8a4ba8b88e52f9 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4179,6 +4179,7 @@ void bond_setup(struct net_device *bond_dev) /* Initialize the device entry points */ ether_setup(bond_dev); + bond_dev->max_mtu = ETH_MAX_MTU; bond_dev->netdev_ops = &bond_netdev_ops; bond_dev->ethtool_ops = &bond_ethtool_ops; From 9f674e48c13dcbc31ac903433727837795b81efe Mon Sep 17 00:00:00 2001 From: Anoob Soman Date: Thu, 2 Mar 2017 10:50:20 +0000 Subject: [PATCH 49/69] xen-netback: Use GFP_ATOMIC to allocate hash Allocation of new_hash, inside xenvif_new_hash(), always happens in softirq context, so use GFP_ATOMIC instead of GFP_KERNEL for new hash allocation. Signed-off-by: Anoob Soman Signed-off-by: David S. Miller --- drivers/net/xen-netback/hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c index e8c5dddc54ba27..3c4c58b9fe76ed 100644 --- a/drivers/net/xen-netback/hash.c +++ b/drivers/net/xen-netback/hash.c @@ -39,7 +39,7 @@ static void xenvif_add_hash(struct xenvif *vif, const u8 *tag, unsigned long flags; bool found; - new = kmalloc(sizeof(*entry), GFP_KERNEL); + new = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!new) return; From d3aa9c9f212a729e46653d4c1eb6a9ab190efe3a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Dec 2016 15:20:34 +0100 Subject: [PATCH 50/69] ixgbe: update the rss key on h/w, when ethtool ask for it Currently ixgbe_set_rxfh() updates the rss_key copy in the driver memory, but does not push the new value into the h/w. This commit adds a new helper for the latter operation and calls it in ixgbe_set_rxfh(), so that the h/w rss key value can be really updated via ethtool.
Signed-off-by: Paolo Abeni Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 + .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 4 +++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 19 ++++++++++++++++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index a2cc43d2888801..7a951b11682152 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -929,6 +929,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct ixgbe_adapter *adapter, struct ixgbe_ring *tx_ring); u32 ixgbe_rss_indir_tbl_entries(struct ixgbe_adapter *adapter); +void ixgbe_store_key(struct ixgbe_adapter *adapter); void ixgbe_store_reta(struct ixgbe_adapter *adapter); s32 ixgbe_negotiate_fc(struct ixgbe_hw *hw, u32 adv_reg, u32 lp_reg, u32 adv_sym, u32 adv_asm, u32 lp_sym, u32 lp_asm); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index a7574c7b12af06..90fa5bf23d1b5f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2998,8 +2998,10 @@ static int ixgbe_set_rxfh(struct net_device *netdev, const u32 *indir, } /* Fill out the rss hash key */ - if (key) + if (key) { memcpy(adapter->rss_key, key, ixgbe_get_rxfh_key_size(netdev)); + ixgbe_store_key(adapter); + } ixgbe_store_reta(adapter); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 060cdce8058f9b..67ab13fd163c16 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -3473,6 +3473,21 @@ u32 ixgbe_rss_indir_tbl_entries(struct ixgbe_adapter *adapter) return 512; } +/** + * ixgbe_store_key - Write the RSS key to HW + * @adapter: device handle + * + * Write the RSS key stored in adapter.rss_key to 
HW. + */ +void ixgbe_store_key(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + int i; + + for (i = 0; i < 10; i++) + IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), adapter->rss_key[i]); +} + /** * ixgbe_store_reta - Write the RETA table to HW * @adapter: device handle @@ -3538,7 +3553,6 @@ static void ixgbe_store_vfreta(struct ixgbe_adapter *adapter) static void ixgbe_setup_reta(struct ixgbe_adapter *adapter) { - struct ixgbe_hw *hw = &adapter->hw; u32 i, j; u32 reta_entries = ixgbe_rss_indir_tbl_entries(adapter); u16 rss_i = adapter->ring_feature[RING_F_RSS].indices; @@ -3551,8 +3565,7 @@ static void ixgbe_setup_reta(struct ixgbe_adapter *adapter) rss_i = 4; /* Fill out hash function seeds */ - for (i = 0; i < 10; i++) - IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), adapter->rss_key[i]); + ixgbe_store_key(adapter); /* Fill out redirection table */ memset(adapter->rss_indir_tbl, 0, sizeof(adapter->rss_indir_tbl)); From c74042f3b3ca982652af99cad85252a2655c6064 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 3 Feb 2017 09:19:40 -0800 Subject: [PATCH 51/69] ixgbe: Limit use of 2K buffers on architectures with 256B or larger cache lines On architectures that have a cache line size larger than 64 Bytes we start running into issues where the amount of headroom for the frame starts shrinking. The size of skb_shared_info on a system with a 64B L1 cache line size is 320. This increases to 384 with a 128B cache line, and 512 with a 256B cache line. In addition the NET_SKB_PAD value increases as well consistent with the cache line size. As a result when we get to a 256B cache line as seen on the s390 we end up 768 bytes used by padding and shared info leaving us with only 1280 bytes to use for data storage. On architectures such as this we should default to using 3K Rx buffers out of a 8K page instead of trying to do 1.5K buffers out of a 4K page. 
To take all of this into account I have added one small check so that we compare the max_frame to the amount of actual data we can store. This was already occurring for igb, but I had overlooked it for ixgbe as it doesn't have strict limits for 82599 once we enable jumbo frames. By adding this check we will automatically enable 3K Rx buffers as soon as the maximum frame size we can handle drops below the standard Ethernet MTU. I also went through and fixed one small typo that I found where I had left an IGB in a variable name due to a copy/paste error. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 7a951b11682152..b1ecc2627a5aee 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -96,7 +96,7 @@ #define IXGBE_MAX_FRAME_BUILD_SKB \ (SKB_WITH_OVERHEAD(IXGBE_RXBUFFER_2K) - IXGBE_SKB_PAD) #else -#define IGB_MAX_FRAME_BUILD_SKB IXGBE_RXBUFFER_2K +#define IXGBE_MAX_FRAME_BUILD_SKB IXGBE_RXBUFFER_2K #endif /* diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 67ab13fd163c16..a7a430a7be2cd9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -3972,7 +3972,8 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter) if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state); - if (max_frame > (ETH_FRAME_LEN + ETH_FCS_LEN)) + if ((max_frame > (ETH_FRAME_LEN + ETH_FCS_LEN)) || + (max_frame > IXGBE_MAX_FRAME_BUILD_SKB)) set_bit(__IXGBE_RX_3K_BUFFER, &rx_ring->state); #endif } From c919a3069c775c1c876bec55e00b2305d5125caa Mon Sep 17 00:00:00 2001 From: Ethan Zonca Date: 
Fri, 24 Feb 2017 11:27:36 -0500 Subject: [PATCH 52/69] can: gs_usb: Don't use stack memory for USB transfers Fixes: 05ca5270005c can: gs_usb: add ethtool set_phys_id callback to locate physical device The gs_usb driver is performing USB transfers using buffers allocated on the stack. This causes the driver to not function with vmapped stacks. Instead, allocate memory for the transfer buffers. Signed-off-by: Ethan Zonca Cc: linux-stable # >= v4.8 Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 40 ++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 77e3cc06a30c8c..a0dabd4038ba35 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -908,10 +908,14 @@ static int gs_usb_probe(struct usb_interface *intf, struct gs_usb *dev; int rc = -ENOMEM; unsigned int icount, i; - struct gs_host_config hconf = { - .byte_order = 0x0000beef, - }; - struct gs_device_config dconf; + struct gs_host_config *hconf; + struct gs_device_config *dconf; + + hconf = kmalloc(sizeof(*hconf), GFP_KERNEL); + if (!hconf) + return -ENOMEM; + + hconf->byte_order = 0x0000beef; /* send host config */ rc = usb_control_msg(interface_to_usbdev(intf), @@ -920,16 +924,22 @@ static int gs_usb_probe(struct usb_interface *intf, USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, 1, intf->altsetting[0].desc.bInterfaceNumber, - &hconf, - sizeof(hconf), + hconf, + sizeof(*hconf), 1000); + kfree(hconf); + if (rc < 0) { dev_err(&intf->dev, "Couldn't send data format (err=%d)\n", rc); return rc; } + dconf = kmalloc(sizeof(*dconf), GFP_KERNEL); + if (!dconf) + return -ENOMEM; + /* read device config */ rc = usb_control_msg(interface_to_usbdev(intf), usb_rcvctrlpipe(interface_to_usbdev(intf), 0), @@ -937,28 +947,33 @@ static int gs_usb_probe(struct usb_interface *intf, USB_DIR_IN|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, 1, intf->altsetting[0].desc.bInterfaceNumber, - 
&dconf, - sizeof(dconf), + dconf, + sizeof(*dconf), 1000); if (rc < 0) { dev_err(&intf->dev, "Couldn't get device config: (err=%d)\n", rc); + kfree(dconf); return rc; } - icount = dconf.icount + 1; + icount = dconf->icount + 1; dev_info(&intf->dev, "Configuring for %d interfaces\n", icount); if (icount > GS_MAX_INTF) { dev_err(&intf->dev, "Driver cannot handle more that %d CAN interfaces\n", GS_MAX_INTF); + kfree(dconf); return -EINVAL; } dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) + if (!dev) { + kfree(dconf); return -ENOMEM; + } + init_usb_anchor(&dev->rx_submitted); atomic_set(&dev->active_channels, 0); @@ -967,7 +982,7 @@ static int gs_usb_probe(struct usb_interface *intf, dev->udev = interface_to_usbdev(intf); for (i = 0; i < icount; i++) { - dev->canch[i] = gs_make_candev(i, intf, &dconf); + dev->canch[i] = gs_make_candev(i, intf, dconf); if (IS_ERR_OR_NULL(dev->canch[i])) { /* save error code to return later */ rc = PTR_ERR(dev->canch[i]); @@ -978,12 +993,15 @@ static int gs_usb_probe(struct usb_interface *intf, gs_destroy_candev(dev->canch[i]); usb_kill_anchored_urbs(&dev->rx_submitted); + kfree(dconf); kfree(dev); return rc; } dev->canch[i]->parent = dev; } + kfree(dconf); + return 0; } From 540a27aef355e3fd8c598600d4a3c8f92127ee05 Mon Sep 17 00:00:00 2001 From: Ethan Zonca Date: Fri, 24 Feb 2017 11:00:34 -0500 Subject: [PATCH 53/69] can: gs_usb: fix coding style This patch fixes five minor style issues, spaces are between bitwise OR operators. 
Signed-off-by: Ethan Zonca Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index a0dabd4038ba35..300349fe8dc049 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -258,7 +258,7 @@ static int gs_cmd_reset(struct gs_usb *gsusb, struct gs_can *gsdev) rc = usb_control_msg(interface_to_usbdev(intf), usb_sndctrlpipe(interface_to_usbdev(intf), 0), GS_USB_BREQ_MODE, - USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, gsdev->channel, 0, dm, @@ -432,7 +432,7 @@ static int gs_usb_set_bittiming(struct net_device *netdev) rc = usb_control_msg(interface_to_usbdev(intf), usb_sndctrlpipe(interface_to_usbdev(intf), 0), GS_USB_BREQ_BITTIMING, - USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, dev->channel, 0, dbt, @@ -546,7 +546,6 @@ static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb, hf, urb->transfer_dma); - if (rc == -ENODEV) { netif_device_detach(netdev); } else { @@ -804,7 +803,7 @@ static struct gs_can *gs_make_candev(unsigned int channel, rc = usb_control_msg(interface_to_usbdev(intf), usb_rcvctrlpipe(interface_to_usbdev(intf), 0), GS_USB_BREQ_BT_CONST, - USB_DIR_IN|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, + USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, channel, 0, bt_const, @@ -921,7 +920,7 @@ static int gs_usb_probe(struct usb_interface *intf, rc = usb_control_msg(interface_to_usbdev(intf), usb_sndctrlpipe(interface_to_usbdev(intf), 0), GS_USB_BREQ_HOST_FORMAT, - USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 1, intf->altsetting[0].desc.bInterfaceNumber, hconf, @@ -944,7 +943,7 @@ static int gs_usb_probe(struct usb_interface *intf, rc = usb_control_msg(interface_to_usbdev(intf), usb_rcvctrlpipe(interface_to_usbdev(intf), 
0), GS_USB_BREQ_DEVICE_CONFIG, - USB_DIR_IN|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, + USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 1, intf->altsetting[0].desc.bInterfaceNumber, dconf, From 7c42631376306fb3f34d51fda546b50a9b6dd6ec Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 2 Mar 2017 12:03:40 +0100 Subject: [PATCH 54/69] can: usb_8dev: Fix memory leak of priv->cmd_msg_buffer The priv->cmd_msg_buffer is allocated in the probe function, but never kfree()ed. This patch converts the kzalloc() to resource-managed kzalloc. Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/usb_8dev.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 108a30e1509756..d000cb62d6ae8c 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -951,8 +951,8 @@ static int usb_8dev_probe(struct usb_interface *intf, for (i = 0; i < MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = MAX_TX_URBS; - priv->cmd_msg_buffer = kzalloc(sizeof(struct usb_8dev_cmd_msg), - GFP_KERNEL); + priv->cmd_msg_buffer = devm_kzalloc(&intf->dev, sizeof(struct usb_8dev_cmd_msg), + GFP_KERNEL); if (!priv->cmd_msg_buffer) goto cleanup_candev; @@ -966,7 +966,7 @@ static int usb_8dev_probe(struct usb_interface *intf, if (err) { netdev_err(netdev, "couldn't register CAN device: %d\n", err); - goto cleanup_cmd_msg_buffer; + goto cleanup_candev; } err = usb_8dev_cmd_version(priv, &version); @@ -987,9 +987,6 @@ static int usb_8dev_probe(struct usb_interface *intf, cleanup_unregister_candev: unregister_netdev(priv->netdev); -cleanup_cmd_msg_buffer: - kfree(priv->cmd_msg_buffer); - cleanup_candev: free_candev(netdev); From 66ddb82129df66a94219844c509074adb4330a28 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 2 Mar 2017 15:42:49 +0100 Subject: [PATCH 55/69] can: flexcan: fix typo in comment This patch fixes the typo "Disble" -> "Disable". 
Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index ea57fed375c634..13f0f219d8aa83 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -196,7 +196,7 @@ #define FLEXCAN_QUIRK_BROKEN_ERR_STATE BIT(1) /* [TR]WRN_INT not connected */ #define FLEXCAN_QUIRK_DISABLE_RXFG BIT(2) /* Disable RX FIFO Global mask */ #define FLEXCAN_QUIRK_ENABLE_EACEN_RRS BIT(3) /* Enable EACEN and RRS bit in ctrl2 */ -#define FLEXCAN_QUIRK_DISABLE_MECR BIT(4) /* Disble Memory error detection */ +#define FLEXCAN_QUIRK_DISABLE_MECR BIT(4) /* Disable Memory error detection */ #define FLEXCAN_QUIRK_USE_OFF_TIMESTAMP BIT(5) /* Use timestamp based offloading */ /* Structure of the message buffer */ From da2f27e9e615d1c799c9582b15262458da61fddc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 1 Mar 2017 15:33:26 +0100 Subject: [PATCH 56/69] netfilter: nf_conntrack_sip: fix wrong memory initialisation In commit 82de0be6862cd ("netfilter: Add helper array register/unregister functions"), struct nf_conntrack_helper sip[MAX_PORTS][4] was changed to sip[MAX_PORTS * 4], so the memory init should have been changed to memset(&sip[4 * i], 0, 4 * sizeof(sip[i])); But as the sip[] table is allocated in the BSS, it is already set to 0 Fixes: 82de0be6862cd ("netfilter: Add helper array register/unregister functions") Signed-off-by: Christophe Leroy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 24174c5202398f..0d17894798b5ca 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1628,8 +1628,6 @@ static int __init nf_conntrack_sip_init(void) ports[ports_c++] = SIP_PORT; for (i = 0; i < ports_c; i++) { - memset(&sip[i], 0, sizeof(sip[i])); - nf_ct_helper_init(&sip[4 * 
i], AF_INET, IPPROTO_UDP, "sip", SIP_PORT, ports[i], i, sip_exp_policy, SIP_EXPECT_MAX, From f9121355eb6f9babadb97bf5b34ab0cce7764406 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Mar 2017 18:15:11 +0100 Subject: [PATCH 57/69] netfilter: nft_set_rbtree: incorrect assumption on lower interval lookups In case of adjacent ranges, we may indeed see either the high part of the range in first place or the low part of it. Remove this incorrect assumption, let's make sure we annotate the low part of the interval in case we have adjacent intervals, so we hit a match in lookups. Reported-by: Simon Hanisch Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 71e8fb886a73b7..78dfbf9588b368 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -60,11 +60,10 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, d = memcmp(this, key, set->klen); if (d < 0) { parent = parent->rb_left; - /* In case of adjacent ranges, we always see the high - * part of the range in first place, before the low one. - * So don't update interval if the keys are equal. - */ - if (interval && nft_rbtree_equal(set, this, interval)) + if (interval && + nft_rbtree_equal(set, this, interval) && + nft_rbtree_interval_end(this) && + !nft_rbtree_interval_end(interval)) continue; interval = rbe; } else if (d > 0) From 25e94a997b324b5f167f56d56d7106d38b78c9de Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Mar 2017 12:52:31 +0100 Subject: [PATCH 58/69] netfilter: nf_tables: don't call nfnetlink_set_err() if nfnetlink_send() fails The underlying nlmsg_multicast() already sets sk->sk_err for us to notify socket overruns, so we should not do anything with this return value. So we just call nfnetlink_set_err() if: 1) We fail to allocate the netlink message.
or 2) We don't have enough space in the netlink message to place attributes, which means that we likely need to allocate a larger message. Before this patch, the internal ESRCH netlink error code was propagated to userspace, which is quite misleading. Netlink semantics mandate that listeners just hit ENOBUFS if the socket buffer overruns. Reported-by: Alexander Alemayhu Tested-by: Alexander Alemayhu Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +- net/netfilter/nf_tables_api.c | 133 ++++++++++++------------------ 2 files changed, 58 insertions(+), 81 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ac84686aaafb0b..2aa8a9d80fbe82 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -988,9 +988,9 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); -int nft_obj_notify(struct net *net, struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, - int event, int family, int report, gfp_t gfp); +void nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, + int event, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ff7304ae58ac4f..5e0ccfd5bb37d1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -461,16 +461,15 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, return -1; } -static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -482,14 
+481,11 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_tables(struct sk_buff *skb, @@ -1050,16 +1046,15 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, return -1; } -static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -1072,14 +1067,11 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_chains(struct sk_buff *skb, @@ -1934,18 +1926,16 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, return -1; } -static int nf_tables_rule_notify(const struct nft_ctx *ctx, - const struct nft_rule *rule, - int event) +static void nf_tables_rule_notify(const struct nft_ctx *ctx, + const struct nft_rule *rule, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - 
err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -1958,14 +1948,11 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } struct nft_rule_dump_ctx { @@ -2696,9 +2683,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, return -1; } -static int nf_tables_set_notify(const struct nft_ctx *ctx, - const struct nft_set *set, - int event, gfp_t gfp_flags) +static void nf_tables_set_notify(const struct nft_ctx *ctx, + const struct nft_set *set, int event, + gfp_t gfp_flags) { struct sk_buff *skb; u32 portid = ctx->portid; @@ -2706,9 +2693,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); if (skb == NULL) goto err; @@ -2719,12 +2705,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, - ctx->report, gfp_flags); + nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report, + gfp_flags); + return; err: - if (err < 0) - nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); - return err; + nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) @@ -3504,10 +3489,10 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, return -1; } -static int nf_tables_setelem_notify(const struct nft_ctx *ctx, - const struct nft_set *set, - const struct 
nft_set_elem *elem, - int event, u16 flags) +static void nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) { struct net *net = ctx->net; u32 portid = ctx->portid; @@ -3515,9 +3500,8 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx, int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -3529,12 +3513,11 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, - GFP_KERNEL); + nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); + return; err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); - return err; + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, @@ -4476,18 +4459,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return nft_delobj(&ctx, obj); } -int nft_obj_notify(struct net *net, struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) +void nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + int family, int report, gfp_t gfp) { struct sk_buff *skb; int err; if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, gfp); if (skb == NULL) goto err; @@ -4499,21 +4481,18 @@ int nft_obj_notify(struct net *net, struct nft_table *table, goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + return; err: - if (err < 0) { - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); - } - 
return err; + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } EXPORT_SYMBOL_GPL(nft_obj_notify); -static int nf_tables_obj_notify(const struct nft_ctx *ctx, - struct nft_object *obj, int event) +static void nf_tables_obj_notify(const struct nft_ctx *ctx, + struct nft_object *obj, int event) { - return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, - ctx->seq, event, ctx->afi->family, ctx->report, - GFP_KERNEL); + nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, + ctx->afi->family, ctx->report, GFP_KERNEL); } static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, @@ -4543,7 +4522,8 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, return -EMSGSIZE; } -static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, + int event) { struct nlmsghdr *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; @@ -4551,9 +4531,8 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) if (nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) goto err; @@ -4565,14 +4544,12 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) goto err; } - err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, - NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); + nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + nlmsg_report(nlh), GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + -ENOBUFS); } static int nf_tables_getgen(struct net *net, struct sock *nlsk, From d67ce7da3b1eda838db7bdca86d7ec28ef37068e Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 2 Mar 2017 12:54:25 +0000 
Subject: [PATCH 59/69] xen-netback: keep a local pointer for vif in backend_disconnect() This patch replaces use of 'be->vif' with 'vif' and hence generally makes the function look tidier. No semantic change. Signed-off-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/net/xen-netback/xenbus.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index bb854f92f5a5cd..d82ddc913b4c93 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -492,24 +492,28 @@ static int backend_create_xenvif(struct backend_info *be) static void backend_disconnect(struct backend_info *be) { - if (be->vif) { + struct xenvif *vif = be->vif; + + if (vif) { unsigned int queue_index; - xen_unregister_watchers(be->vif); + xen_unregister_watchers(vif); #ifdef CONFIG_DEBUG_FS - xenvif_debugfs_delif(be->vif); + xenvif_debugfs_delif(vif); #endif /* CONFIG_DEBUG_FS */ - xenvif_disconnect_data(be->vif); - for (queue_index = 0; queue_index < be->vif->num_queues; ++queue_index) - xenvif_deinit_queue(&be->vif->queues[queue_index]); - - spin_lock(&be->vif->lock); - vfree(be->vif->queues); - be->vif->num_queues = 0; - be->vif->queues = NULL; - spin_unlock(&be->vif->lock); - - xenvif_disconnect_ctrl(be->vif); + xenvif_disconnect_data(vif); + for (queue_index = 0; + queue_index < vif->num_queues; + ++queue_index) + xenvif_deinit_queue(&vif->queues[queue_index]); + + spin_lock(&vif->lock); + vfree(vif->queues); + vif->num_queues = 0; + vif->queues = NULL; + spin_unlock(&vif->lock); + + xenvif_disconnect_ctrl(vif); } } From a254d8f9a8a928772ef4608342125ccb35b79d5d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 2 Mar 2017 12:54:26 +0000 Subject: [PATCH 60/69] xen-netback: don't vfree() queues under spinlock This leads to a BUG of the following form: [ 174.512861] switch: port 2(vif3.0) entered disabled state [ 174.522735] BUG: sleeping function 
called from invalid context at /home/build/linux-linus/mm/vmalloc.c:1441 [ 174.523451] in_atomic(): 1, irqs_disabled(): 0, pid: 28, name: xenwatch [ 174.524131] CPU: 1 PID: 28 Comm: xenwatch Tainted: G W 4.10.0upstream-11073-g4977ab6-dirty #1 [ 174.524819] Hardware name: MSI MS-7680/H61M-P23 (MS-7680), BIOS V17.0 03/14/2011 [ 174.525517] Call Trace: [ 174.526217] show_stack+0x23/0x60 [ 174.526899] dump_stack+0x5b/0x88 [ 174.527562] ___might_sleep+0xde/0x130 [ 174.528208] __might_sleep+0x35/0xa0 [ 174.528840] ? _raw_spin_unlock_irqrestore+0x13/0x20 [ 174.529463] ? __wake_up+0x40/0x50 [ 174.530089] remove_vm_area+0x20/0x90 [ 174.530724] __vunmap+0x1d/0xc0 [ 174.531346] ? delete_object_full+0x13/0x20 [ 174.531973] vfree+0x40/0x80 [ 174.532594] set_backend_state+0x18a/0xa90 [ 174.533221] ? dwc_scan_descriptors+0x24d/0x430 [ 174.533850] ? kfree+0x5b/0xc0 [ 174.534476] ? xenbus_read+0x3d/0x50 [ 174.535101] ? xenbus_read+0x3d/0x50 [ 174.535718] ? xenbus_gather+0x31/0x90 [ 174.536332] ? ___might_sleep+0xf6/0x130 [ 174.536945] frontend_changed+0x6b/0xd0 [ 174.537565] xenbus_otherend_changed+0x7d/0x80 [ 174.538185] frontend_changed+0x12/0x20 [ 174.538803] xenwatch_thread+0x74/0x110 [ 174.539417] ? woken_wake_function+0x20/0x20 [ 174.540049] kthread+0xe5/0x120 [ 174.540663] ? xenbus_printf+0x50/0x50 [ 174.541278] ? __kthread_init_worker+0x40/0x40 [ 174.541898] ret_from_fork+0x21/0x2c [ 174.548635] switch: port 2(vif3.0) entered disabled state This patch defers the vfree() until after the spinlock is released. Reported-by: Juergen Gross Signed-off-by: Paul Durrant Signed-off-by: David S. 
Miller --- drivers/net/xen-netback/xenbus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index d82ddc913b4c93..d2d7cd9145b1c2 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -496,6 +496,7 @@ static void backend_disconnect(struct backend_info *be) if (vif) { unsigned int queue_index; + struct xenvif_queue *queues; xen_unregister_watchers(vif); #ifdef CONFIG_DEBUG_FS @@ -508,11 +509,13 @@ static void backend_disconnect(struct backend_info *be) xenvif_deinit_queue(&vif->queues[queue_index]); spin_lock(&vif->lock); - vfree(vif->queues); + queues = vif->queues; vif->num_queues = 0; vif->queues = NULL; spin_unlock(&vif->lock); + vfree(queues); + xenvif_disconnect_ctrl(vif); } } From 16206524f6ea57d6dcd56fe46f9f4a06d4a1b113 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Thu, 2 Mar 2017 17:59:56 -0500 Subject: [PATCH 61/69] net: ethernet: bgmac: init sequence bug Fix a bug in the 'bgmac' driver init sequence, which does blind writes where it should preserve most bits other than the ones it is deliberately manipulating. The code now checks to see if the adapter needs to be brought out of reset (whereas before it was doing an IDM write to bring it out of reset regardless of whether it was in reset or not). Also, removed unnecessary usleeps (as there is already a read present to flush the IDM writes). Signed-off-by: Zac Schroff Signed-off-by: Jon Mason Fixes: f6a95a24957 ("net: ethernet: bgmac: Add platform device support") Signed-off-by: David S. 
Miller --- .../net/ethernet/broadcom/bgmac-platform.c | 27 ++++++++++++------- drivers/net/ethernet/broadcom/bgmac.h | 16 +++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index 7b1af950f312f3..da1b8b225eb9d3 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -51,8 +51,7 @@ static void platform_bgmac_idm_write(struct bgmac *bgmac, u16 offset, u32 value) static bool platform_bgmac_clk_enabled(struct bgmac *bgmac) { - if ((bgmac_idm_read(bgmac, BCMA_IOCTL) & - (BCMA_IOCTL_CLK | BCMA_IOCTL_FGC)) != BCMA_IOCTL_CLK) + if ((bgmac_idm_read(bgmac, BCMA_IOCTL) & BGMAC_CLK_EN) != BGMAC_CLK_EN) return false; if (bgmac_idm_read(bgmac, BCMA_RESET_CTL) & BCMA_RESET_CTL_RESET) return false; @@ -61,15 +60,25 @@ static bool platform_bgmac_clk_enabled(struct bgmac *bgmac) static void platform_bgmac_clk_enable(struct bgmac *bgmac, u32 flags) { - bgmac_idm_write(bgmac, BCMA_IOCTL, - (BCMA_IOCTL_CLK | BCMA_IOCTL_FGC | flags)); - bgmac_idm_read(bgmac, BCMA_IOCTL); + u32 val; - bgmac_idm_write(bgmac, BCMA_RESET_CTL, 0); - bgmac_idm_read(bgmac, BCMA_RESET_CTL); - udelay(1); + /* The Reset Control register only contains a single bit to show if the + * controller is currently in reset. Do a sanity check here, just in + * case the bootloader happened to leave the device in reset. 
+ */ + val = bgmac_idm_read(bgmac, BCMA_RESET_CTL); + if (val) { + bgmac_idm_write(bgmac, BCMA_RESET_CTL, 0); + bgmac_idm_read(bgmac, BCMA_RESET_CTL); + udelay(1); + } - bgmac_idm_write(bgmac, BCMA_IOCTL, (BCMA_IOCTL_CLK | flags)); + val = bgmac_idm_read(bgmac, BCMA_IOCTL); + /* Some bits of BCMA_IOCTL set by HW/ATF and should not change */ + val |= flags & ~(BGMAC_AWCACHE | BGMAC_ARCACHE | BGMAC_AWUSER | + BGMAC_ARUSER); + val |= BGMAC_CLK_EN; + bgmac_idm_write(bgmac, BCMA_IOCTL, val); bgmac_idm_read(bgmac, BCMA_IOCTL); udelay(1); } diff --git a/drivers/net/ethernet/broadcom/bgmac.h b/drivers/net/ethernet/broadcom/bgmac.h index 248727dc62f22c..6d1c6ff1ed963e 100644 --- a/drivers/net/ethernet/broadcom/bgmac.h +++ b/drivers/net/ethernet/broadcom/bgmac.h @@ -213,6 +213,22 @@ /* BCMA GMAC core specific IO Control (BCMA_IOCTL) flags */ #define BGMAC_BCMA_IOCTL_SW_CLKEN 0x00000004 /* PHY Clock Enable */ #define BGMAC_BCMA_IOCTL_SW_RESET 0x00000008 /* PHY Reset */ +/* The IOCTL values appear to be different in NS, NSP, and NS2, and do not match + * the values directly above + */ +#define BGMAC_CLK_EN BIT(0) +#define BGMAC_RESERVED_0 BIT(1) +#define BGMAC_SOURCE_SYNC_MODE_EN BIT(2) +#define BGMAC_DEST_SYNC_MODE_EN BIT(3) +#define BGMAC_TX_CLK_OUT_INVERT_EN BIT(4) +#define BGMAC_DIRECT_GMII_MODE BIT(5) +#define BGMAC_CLK_250_SEL BIT(6) +#define BGMAC_AWCACHE (0xf << 7) +#define BGMAC_RESERVED_1 (0x1f << 11) +#define BGMAC_ARCACHE (0xf << 16) +#define BGMAC_AWUSER (0x3f << 20) +#define BGMAC_ARUSER (0x3f << 26) +#define BGMAC_RESERVED BIT(31) /* BCMA GMAC core specific IO status (BCMA_IOST) flags */ #define BGMAC_BCMA_IOST_ATTACHED 0x00000800 From fa42245dff4a5f2f8f208da542acbd80c22f7c65 Mon Sep 17 00:00:00 2001 From: Hari Vyas Date: Thu, 2 Mar 2017 17:59:57 -0500 Subject: [PATCH 62/69] net: ethernet: bgmac: mac address change bug ndo_set_mac_address() passes struct sockaddr * as 2nd parameter to bgmac_set_mac_address() but code assumed u8 *. 
This caused two bytes to be chopped off and the wrong MAC address to be configured. Signed-off-by: Hari Vyas Signed-off-by: Jon Mason Fixes: 4e209001b86 ("bgmac: write mac address to hardware in ndo_set_mac_address") Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bgmac.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 415046750bb449..fd66fca00e0177 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1223,12 +1223,16 @@ static netdev_tx_t bgmac_start_xmit(struct sk_buff *skb, static int bgmac_set_mac_address(struct net_device *net_dev, void *addr) { struct bgmac *bgmac = netdev_priv(net_dev); + struct sockaddr *sa = addr; int ret; ret = eth_prepare_mac_addr_change(net_dev, addr); if (ret < 0) return ret; - bgmac_write_mac_address(bgmac, (u8 *)addr); + + ether_addr_copy(net_dev->dev_addr, sa->sa_data); + bgmac_write_mac_address(bgmac, net_dev->dev_addr); + eth_commit_mac_addr_change(net_dev, addr); return 0; } From 9383b33771e566fa547daa2d09c6e0f1aaa298c3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Mar 2017 15:26:20 -0800 Subject: [PATCH 63/69] nfp: don't tell FW about the reserved buffer space Since commit c0f031bc8866 ("nfp_net: use alloc_frag() and build_skb()") we are allocating buffers which have to hold both the data and skb to be created in place by build_skb(). FW should only be told about the buffer space it can DMA to, that is without the build_skb() headroom and tailroom. Note: firmware applications should validate the buffers against both MTU and free list buffer size so oversized packets would not pass through the NIC anyway. Fixes: c0f031bc8866 ("nfp: use alloc_frag() and build_skb()") Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 074259cc8e066d..00a83218857a97 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2198,7 +2198,8 @@ static int __nfp_net_set_config_and_enable(struct nfp_net *nn) nfp_net_write_mac_addr(nn); nn_writel(nn, NFP_NET_CFG_MTU, nn->netdev->mtu); - nn_writel(nn, NFP_NET_CFG_FLBUFSZ, nn->fl_bufsz); + nn_writel(nn, NFP_NET_CFG_FLBUFSZ, + nn->fl_bufsz - NFP_NET_RX_BUF_NON_DATA); /* Enable device */ new_ctrl |= NFP_NET_CFG_CTRL_ENABLE; From d58cebb79b62ff84b537a35423b8d6b7f0746985 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Mar 2017 15:26:21 -0800 Subject: [PATCH 64/69] nfp: correct DMA direction in XDP DMA sync dma_sync_single_for_*() takes the direction in which the buffer was mapped, not the direction of the sync. We should sync XDP buffers bidirectionally. Fixes: ecd63a0217d5 ("nfp: add XDP support in the driver") Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 00a83218857a97..9179a99563afa8 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1498,7 +1498,7 @@ nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring, txbuf->real_len = pkt_len; dma_sync_single_for_device(&nn->pdev->dev, rxbuf->dma_addr + pkt_off, - pkt_len, DMA_TO_DEVICE); + pkt_len, DMA_BIDIRECTIONAL); /* Build TX descriptor */ txd = &tx_ring->txds[wr_idx]; @@ -1611,7 +1611,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) dma_sync_single_for_cpu(&nn->pdev->dev, rxbuf->dma_addr + pkt_off, - pkt_len, DMA_FROM_DEVICE); + pkt_len, DMA_BIDIRECTIONAL); act = nfp_net_run_xdp(xdp_prog, rxbuf->frag + data_off, pkt_len); switch (act) { From 37411cad633f5e41f8a13007654909d21b19363a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Mar 2017 23:48:52 +0000 Subject: [PATCH 65/69] rxrpc: Fix potential NULL-pointer exception Fix a potential NULL-pointer exception in rxrpc_do_sendmsg(). The call state check that I added should have gone into the else-body of the if-statement where we actually have a call to check. Found by CoverityScan CID#1414316 ("Dereference after null check"). Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg") Reported-by: Colin Ian King Signed-off-by: David Howells Signed-off-by: David S. 
Miller --- net/rxrpc/sendmsg.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 31c1538c1a8de6..27685d8cba1aef 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -517,13 +517,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = -EBADSLT; if (cmd != RXRPC_CMD_SEND_DATA) goto error_release_sock; - ret = -EBUSY; - if (call->state == RXRPC_CALL_UNINITIALISED || - call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || - call->state == RXRPC_CALL_SERVER_PREALLOC || - call->state == RXRPC_CALL_SERVER_SECURING || - call->state == RXRPC_CALL_SERVER_ACCEPTING) - goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, exclusive); /* The socket is now unlocked... */ @@ -531,6 +524,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); /* ... and we have the call lock. */ } else { + ret = -EBUSY; + if (call->state == RXRPC_CALL_UNINITIALISED || + call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || + call->state == RXRPC_CALL_SERVER_PREALLOC || + call->state == RXRPC_CALL_SERVER_SECURING || + call->state == RXRPC_CALL_SERVER_ACCEPTING) + goto error_release_sock; + ret = mutex_lock_interruptible(&call->user_mutex); release_sock(&rx->sk); if (ret < 0) { From a8d63a53b3eac8626f62336dcc327c18f1bbca78 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 3 Mar 2017 00:44:26 -0500 Subject: [PATCH 66/69] rds: remove unnecessary returned value check The function rds_trans_register always returns 0. As such, it is not necessary to check the returned value. Cc: Joe Jin Cc: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Yuval Shaia Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib.c | 6 +----- net/rds/rds.h | 2 +- net/rds/tcp.c | 6 +----- net/rds/transport.c | 4 +--- 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 0f557b24331121..7a64c8db81abdc 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -438,16 +438,12 @@ int rds_ib_init(void) if (ret) goto out_sysctl; - ret = rds_trans_register(&rds_ib_transport); - if (ret) - goto out_recv; + rds_trans_register(&rds_ib_transport); rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); goto out; -out_recv: - rds_ib_recv_exit(); out_sysctl: rds_ib_sysctl_exit(); out_ibreg: diff --git a/net/rds/rds.h b/net/rds/rds.h index 07fff73dd4f3f9..6f523ddfe1fb35 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -910,7 +910,7 @@ void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); /* transport.c */ -int rds_trans_register(struct rds_transport *trans); +void rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); void rds_trans_put(struct rds_transport *trans); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 5438f6725092b7..a973d3b4dff0b2 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -652,16 +652,12 @@ static int rds_tcp_init(void) if (ret) goto out_pernet; - ret = rds_trans_register(&rds_tcp_transport); - if (ret) - goto out_recv; + rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); goto out; -out_recv: - rds_tcp_recv_exit(); out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); out_notifier: diff --git a/net/rds/transport.c b/net/rds/transport.c index 2ffd3e30c6434e..0b188dd0a344cb 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -40,7 +40,7 @@ static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); -int rds_trans_register(struct 
rds_transport *trans) +void rds_trans_register(struct rds_transport *trans) { BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); @@ -55,8 +55,6 @@ int rds_trans_register(struct rds_transport *trans) } up_write(&rds_trans_sem); - - return 0; } EXPORT_SYMBOL_GPL(rds_trans_register); From d0346b033899b9affa4da8c32bfb574dfb89859e Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 3 Mar 2017 15:22:09 +0000 Subject: [PATCH 67/69] sfc: avoid max() in array size It confuses sparse, which thinks the size isn't constant. Let's achieve the same thing with a BUILD_BUG_ON, since we know which one should be bigger and don't expect them ever to change. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 92e1c6d8b2937e..4d88e8532d3ee4 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -828,9 +828,7 @@ static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n) static int efx_ef10_link_piobufs(struct efx_nic *efx) { struct efx_ef10_nic_data *nic_data = efx->nic_data; - _MCDI_DECLARE_BUF(inbuf, - max(MC_CMD_LINK_PIOBUF_IN_LEN, - MC_CMD_UNLINK_PIOBUF_IN_LEN)); + MCDI_DECLARE_BUF(inbuf, MC_CMD_LINK_PIOBUF_IN_LEN); struct efx_channel *channel; struct efx_tx_queue *tx_queue; unsigned int offset, index; @@ -839,8 +837,6 @@ static int efx_ef10_link_piobufs(struct efx_nic *efx) BUILD_BUG_ON(MC_CMD_LINK_PIOBUF_OUT_LEN != 0); BUILD_BUG_ON(MC_CMD_UNLINK_PIOBUF_OUT_LEN != 0); - memset(inbuf, 0, sizeof(inbuf)); - /* Link a buffer to each VI in the write-combining mapping */ for (index = 0; index < nic_data->n_piobufs; ++index) { MCDI_SET_DWORD(inbuf, LINK_PIOBUF_IN_PIOBUF_HANDLE, @@ -920,6 +916,10 @@ static int efx_ef10_link_piobufs(struct efx_nic *efx) return 0; fail: + /* inbuf was defined for MC_CMD_LINK_PIOBUF. 
We can use the same + * buffer for MC_CMD_UNLINK_PIOBUF because it's shorter. + */ + BUILD_BUG_ON(MC_CMD_LINK_PIOBUF_IN_LEN < MC_CMD_UNLINK_PIOBUF_IN_LEN); while (index--) { MCDI_SET_DWORD(inbuf, UNLINK_PIOBUF_IN_TXQ_INSTANCE, nic_data->pio_write_vi_base + index); From 6d43131c158f42e2138d83495d19c70ed5cc0ffe Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 3 Mar 2017 15:22:27 +0000 Subject: [PATCH 68/69] sfc: fix IPID endianness in TSOv2 The value we read from the header is in network byte order, whereas EFX_POPULATE_QWORD_* takes values in host byte order (which it then converts to little-endian, as MCDI is little-endian). Fixes: e9117e5099ea ("sfc: Firmware-Assisted TSO version 2") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 4d88e8532d3ee4..c60c2d4c646a89 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2183,7 +2183,7 @@ static int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue, /* Modify IPv4 header if needed. */ ip->tot_len = 0; ip->check = 0; - ipv4_id = ip->id; + ipv4_id = ntohs(ip->id); } else { /* Modify IPv6 header if needed. */ struct ipv6hdr *ipv6 = ipv6_hdr(skb); From f78ef7cd9a0686b979679d0de061c6dbfd8d649e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 3 Mar 2017 12:21:14 -0800 Subject: [PATCH 69/69] strparser: destroy workqueue on module exit Fixes: 43a0c6751a32 ("strparser: Stream parser for messages") Cc: Tom Herbert Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/strparser/strparser.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 41adf362936d7d..b5c279b2268017 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -504,6 +504,7 @@ static int __init strp_mod_init(void) static void __exit strp_mod_exit(void) { + destroy_workqueue(strp_wq); } module_init(strp_mod_init); module_exit(strp_mod_exit);