From 37287d2eef8a361f89eae7867f794433f5ba18fd Mon Sep 17 00:00:00 2001 From: Sagar Balani Date: Fri, 5 Oct 2018 09:44:08 -0700 Subject: [PATCH] [barefoot]: Fix SONiC Build for BFN platforms (#2124) * BFN fixes for sonic master bld (#26) * Need to still update sde deb package with sai1.3.3, will be done in seperate commit * Update bfn sde url - sai1.3.3 --- platform/barefoot/bfn-platform.mk | 2 +- platform/barefoot/bfn-sai.mk | 2 +- .../docker-syncd-bfn-rpc/Dockerfile.j2 | 1 + .../barefoot/docker-syncd-bfn/Dockerfile.j2 | 2 +- .../debian/control | 2 +- .../sonic-platform-modules-bfn/debian/control | 2 +- .../modules/bf_tun.c | 942 +++++++++++------- .../modules/bf_tun.c | 942 +++++++++++------- 8 files changed, 1168 insertions(+), 727 deletions(-) diff --git a/platform/barefoot/bfn-platform.mk b/platform/barefoot/bfn-platform.mk index 6f1e6e302961..72315b1ca553 100644 --- a/platform/barefoot/bfn-platform.mk +++ b/platform/barefoot/bfn-platform.mk @@ -1,5 +1,5 @@ BFN_PLATFORM = bfnplatform_1.0.0_amd64.deb -$(BFN_PLATFORM)_URL = "https://github.com/barefootnetworks/sonic-release-pkgs/raw/rel_8_2/bfnplatform_1.0.0_amd64.deb" +$(BFN_PLATFORM)_URL = "https://github.com/barefootnetworks/sonic-release-pkgs/raw/sde-sai1.3.3/bfnplatform_1.0.0_amd64.deb" SONIC_ONLINE_DEBS += $(BFN_PLATFORM) # $(BFN_SAI_DEV) $(BFN_SAI_DEV)_DEPENDS += $(BFN_PLATFORM) diff --git a/platform/barefoot/bfn-sai.mk b/platform/barefoot/bfn-sai.mk index 7e507d15a9fe..34c9f5ffc244 100644 --- a/platform/barefoot/bfn-sai.mk +++ b/platform/barefoot/bfn-sai.mk @@ -1,5 +1,5 @@ BFN_SAI = bfnsdk_1.0.0_amd64.deb -$(BFN_SAI)_URL = "https://github.com/barefootnetworks/sonic-release-pkgs/raw/rel_8_2/bfnsdk_1.0.0_amd64.deb" +$(BFN_SAI)_URL = "https://github.com/barefootnetworks/sonic-release-pkgs/raw/sde-sai1.3.3/bfnsdk_1.0.0_amd64.deb" SONIC_ONLINE_DEBS += $(BFN_SAI) # $(BFN_SAI_DEV) $(BFN_SAI_DEV)_DEPENDS += $(BFN_SAI) diff --git a/platform/barefoot/docker-syncd-bfn-rpc/Dockerfile.j2 b/platform/barefoot/docker-syncd-bfn-rpc/Dockerfile.j2 index 9efeeb3f6b51..68604d92f6b5 100644 --- a/platform/barefoot/docker-syncd-bfn-rpc/Dockerfile.j2 +++ b/platform/barefoot/docker-syncd-bfn-rpc/Dockerfile.j2 @@ -27,6 +27,7 @@ RUN apt-get update \ python-dev \ wget \ cmake \ + libpython3.4 \ && wget https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz \ && tar xvfz 1.0.0.tar.gz \ && cd nanomsg-1.0.0 \ diff --git a/platform/barefoot/docker-syncd-bfn/Dockerfile.j2 b/platform/barefoot/docker-syncd-bfn/Dockerfile.j2 index 3e31e4b94417..25c7872f6460 100755 --- a/platform/barefoot/docker-syncd-bfn/Dockerfile.j2 +++ b/platform/barefoot/docker-syncd-bfn/Dockerfile.j2 @@ -11,7 +11,7 @@ debs/{{ deb }}{{' '}} {%- endfor -%} debs/ -RUN apt-get install -y libxml2 libpcap-dev libusb-1.0-0-dev libcurl3 libcurl4-gnutls-dev libunwind8-dev +RUN apt-get install -y libxml2 libpcap-dev libusb-1.0-0-dev libcurl3 libcurl4-gnutls-dev libunwind8-dev libpython3.4 RUN dpkg -i \ {% for deb in docker_syncd_bfn_debs.split(' ') -%} diff --git a/platform/barefoot/sonic-platform-modules-bfn-montara/debian/control b/platform/barefoot/sonic-platform-modules-bfn-montara/debian/control index 2b1a9804baea..589f03d2f484 100644 --- a/platform/barefoot/sonic-platform-modules-bfn-montara/debian/control +++ b/platform/barefoot/sonic-platform-modules-bfn-montara/debian/control @@ -7,6 +7,6 @@ Standards-Version: 3.9.3 Package: platform-modules-bfn-montara Architecture: amd64 -Depends: linux-image-3.16.0-5-amd64 +Depends: linux-image-4.9.0-7-amd64 Description: kernel modules for platform devices such as fan, led, sfp diff --git a/platform/barefoot/sonic-platform-modules-bfn/debian/control b/platform/barefoot/sonic-platform-modules-bfn/debian/control index 09d0cc82d2c3..d4fd702298d3 100644 --- a/platform/barefoot/sonic-platform-modules-bfn/debian/control +++ b/platform/barefoot/sonic-platform-modules-bfn/debian/control @@ -7,6 +7,6 @@ Standards-Version: 3.9.3 Package: platform-modules-bfn Architecture: amd64 -Depends: linux-image-3.16.0-5-amd64 +Depends: linux-image-4.9.0-7-amd64 Description: kernel modules for platform devices such as fan, led, sfp diff --git a/platform/barefoot/sonic-platform-modules-bfn/modules/bf_tun.c b/platform/barefoot/sonic-platform-modules-bfn/modules/bf_tun.c index a1ba7047baaa..abb906a24618 100644 --- a/platform/barefoot/sonic-platform-modules-bfn/modules/bf_tun.c +++ b/platform/barefoot/sonic-platform-modules-bfn/modules/bf_tun.c @@ -65,20 +65,21 @@ #include #include #include -#include #include #include #include #include #include +#include +#include #include +#define TUN_MINOR1 201 + /* Uncomment to enable debugging */ /* #define TUN_DEBUG 1 */ -#define TUN_MINOR1 201 - #ifdef TUN_DEBUG static int debug; @@ -105,6 +106,18 @@ do { \ } while (0) #endif +/* TUN device flags */ + +/* IFF_ATTACH_QUEUE is never stored in device flags, + * overload it to mean fasync when stored there. + */ +#define TUN_FASYNC IFF_ATTACH_QUEUE +/* High bits in flags field are unused. */ +#define TUN_VNET_LE 0x80000000 +#define TUN_VNET_BE 0x40000000 + +#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ + IFF_MULTI_QUEUE) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -114,14 +127,24 @@ struct tap_filter { unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; -/* DEFAULT_MAX_NUM_RSS_QUEUES were chosen to let the rx/tx queues allocated for - * the netdevice to be fit in one page. So we can make sure the success of - * memory allocation. TODO: increase the limit. */ -#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES +/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal + * to max number of VCPUs in guest. */ +#define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) +struct tun_pcpu_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 rx_dropped; + u32 tx_dropped; + u32 rx_frame_errors; +}; + /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and @@ -138,7 +161,6 @@ struct tun_file { struct socket socket; struct socket_wq wq; struct tun_struct __rcu *tun; - struct net *net; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; @@ -148,6 +170,7 @@ struct tun_file { }; struct list_head next; struct tun_struct *detached; + struct skb_array tx_array; }; struct tun_flow_entry { @@ -179,6 +202,7 @@ struct tun_struct { #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6|NETIF_F_UFO) + int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; @@ -196,8 +220,73 @@ struct tun_struct { struct list_head disabled; void *security; u32 flow_count; + struct tun_pcpu_stats __percpu *pcpu_stats; }; +#ifdef CONFIG_TUN_VNET_CROSS_LE +static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) +{ + return tun->flags & TUN_VNET_BE ? false : + virtio_legacy_is_little_endian(); +} + +static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) +{ + int be = !!(tun->flags & TUN_VNET_BE); + + if (put_user(be, argp)) + return -EFAULT; + + return 0; +} + +static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) +{ + int be; + + if (get_user(be, argp)) + return -EFAULT; + + if (be) + tun->flags |= TUN_VNET_BE; + else + tun->flags &= ~TUN_VNET_BE; + + return 0; +} +#else +static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) +{ + return virtio_legacy_is_little_endian(); +} + +static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) +{ + return -EINVAL; +} + +static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) +{ + return -EINVAL; +} +#endif /* CONFIG_TUN_VNET_CROSS_LE */ + +static inline bool tun_is_little_endian(struct tun_struct *tun) +{ + return tun->flags & TUN_VNET_LE || + tun_legacy_is_little_endian(tun); +} + +static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val) +{ + return __virtio16_to_cpu(tun_is_little_endian(tun), val); +} + +static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val) +{ + return __cpu_to_virtio16(tun_is_little_endian(tun), val); +} + static inline u32 tun_hashfn(u32 rxhash) { return rxhash & 0x3ff; @@ -238,7 +327,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); - sock_rps_reset_flow_hash(e->rps_rxhash); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; @@ -355,10 +443,8 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash, */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { - if (unlikely(e->rps_rxhash != hash)) { - sock_rps_reset_flow_hash(e->rps_rxhash); + if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; - } } /* We try to identify a flow through its rxhash first. The reason that @@ -433,10 +519,22 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) static void tun_queue_purge(struct tun_file *tfile) { - skb_queue_purge(&tfile->sk.sk_receive_queue); + struct sk_buff *skb; + + while ((skb = skb_array_consume(&tfile->tx_array)) != NULL) + kfree_skb(skb); + skb_queue_purge(&tfile->sk.sk_error_queue); } +static void tun_cleanup_tx_array(struct tun_file *tfile) +{ + if (tfile->tx_array.ring.queue) { + skb_array_cleanup(&tfile->tx_array); + memset(&tfile->tx_array, 0, sizeof(tfile->tx_array)); + } +} + static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; @@ -474,14 +572,12 @@ static void __tun_detach(struct tun_file *tfile, bool clean) if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); - if (!(tun->flags & TUN_PERSIST) && + if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } - - BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, - &tfile->socket.flags)); - sk_release_kernel(&tfile->sk); + tun_cleanup_tx_array(tfile); + sock_put(&tfile->sk); } } @@ -501,11 +597,13 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); + tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { + tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } @@ -517,21 +615,24 @@ static void tun_detach_all(struct net_device *dev) /* Drop read queue */ tun_queue_purge(tfile); sock_put(&tfile->sk); + tun_cleanup_tx_array(tfile); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); tun_queue_purge(tfile); sock_put(&tfile->sk); + tun_cleanup_tx_array(tfile); } BUG_ON(tun->numdisabled != 0); - if (tun->flags & TUN_PERSIST) + if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) { struct tun_file *tfile = file->private_data; + struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); @@ -543,7 +644,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte goto out; err = -EBUSY; - if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1) + if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; @@ -555,12 +656,21 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { - err = __sk_attach_filter(&tun->fprog, tfile->socket.sk, - lockdep_rtnl_is_held()); + lock_sock(tfile->socket.sk); + err = sk_attach_filter(&tun->fprog, tfile->socket.sk); + release_sock(tfile->socket.sk); if (!err) goto out; } + + if (!tfile->detached && + skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) { + err = -ENOMEM; + goto out; + } + tfile->queue_index = tun->numqueues; + tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; @@ -632,14 +742,9 @@ static int update_filter(struct tap_filter *filter, void __user *arg) } alen = ETH_ALEN * uf.count; - addr = kmalloc(alen, GFP_KERNEL); - if (!addr) - return -ENOMEM; - - if (copy_from_user(addr, arg + sizeof(uf), alen)) { - err = -EFAULT; - goto done; - } + addr = memdup_user(arg + sizeof(uf), alen); + if (IS_ERR(addr)) + return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst @@ -659,7 +764,7 @@ static int update_filter(struct tap_filter *filter, void __user *arg) for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ - goto done; + goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } @@ -675,8 +780,7 @@ static int update_filter(struct tap_filter *filter, void __user *arg) /* Return the number of exact filters */ err = nexact; - -done: +free_addr: kfree(addr); return err; } @@ -726,7 +830,18 @@ static void tun_net_uninit(struct net_device *dev) /* Net device open. */ static int tun_net_open(struct net_device *dev) { + struct tun_struct *tun = netdev_priv(dev); + int i; + netif_tx_start_all_queues(dev); + + for (i = 0; i < tun->numqueues; i++) { + struct tun_file *tfile; + + tfile = rtnl_dereference(tun->tfiles[i]); + tfile->socket.sk->sk_write_space(tfile->socket.sk); + } + return 0; } @@ -753,7 +868,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (txq >= numqueues) goto drop; - if (numqueues == 1) { +#ifdef CONFIG_RPS + if (numqueues == 1 && static_key_false(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ @@ -768,6 +884,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) tun_flow_save_rps_rxhash(e, rxhash); } } +#endif tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); @@ -793,10 +910,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; - if (skb->sk) { - sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); - sw_tx_timestamp(skb); - } + skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. @@ -805,8 +919,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); - /* Enqueue packet */ - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); + if (skb_array_produce(&tfile->tx_array, skb)) + goto drop; /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) @@ -817,11 +931,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; drop: - dev->stats.tx_dropped++; + this_cpu_inc(tun->pcpu_stats->tx_dropped); skb_tx_error(skb); kfree_skb(skb); rcu_read_unlock(); - return NETDEV_TX_OK; + return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) @@ -852,16 +966,6 @@ static netdev_features_t tun_net_fix_features(struct net_device *dev, return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } - -static int -tun_change_carrier(struct net_device *dev, bool new_carrier) { - if (new_carrier) - netif_carrier_on(dev); - else - netif_carrier_off(dev); - return 0; -} - #ifdef CONFIG_NET_POLL_CONTROLLER static void tun_poll_controller(struct net_device *dev) { @@ -879,6 +983,63 @@ static void tun_poll_controller(struct net_device *dev) return; } #endif + +static void tun_set_headroom(struct net_device *dev, int new_hr) +{ + struct tun_struct *tun = netdev_priv(dev); + + if (new_hr < NET_SKB_PAD) + new_hr = NET_SKB_PAD; + + tun->align = new_hr; +} + +static struct rtnl_link_stats64 * +tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0; + struct tun_struct *tun = netdev_priv(dev); + struct tun_pcpu_stats *p; + int i; + + for_each_possible_cpu(i) { + u64 rxpackets, rxbytes, txpackets, txbytes; + unsigned int start; + + p = per_cpu_ptr(tun->pcpu_stats, i); + do { + start = u64_stats_fetch_begin(&p->syncp); + rxpackets = p->rx_packets; + rxbytes = p->rx_bytes; + txpackets = p->tx_packets; + txbytes = p->tx_bytes; + } while (u64_stats_fetch_retry(&p->syncp, start)); + + stats->rx_packets += rxpackets; + stats->rx_bytes += rxbytes; + stats->tx_packets += txpackets; + stats->tx_bytes += txbytes; + + /* u32 counters */ + rx_dropped += p->rx_dropped; + rx_frame_errors += p->rx_frame_errors; + tx_dropped += p->tx_dropped; + } + stats->rx_dropped = rx_dropped; + stats->rx_frame_errors = rx_frame_errors; + stats->tx_dropped = tx_dropped; + return stats; +} + +static int +tun_change_carrier(struct net_device *dev, bool new_carrier) { + if (new_carrier) + netif_carrier_on(dev); + else + netif_carrier_off(dev); + return 0; +} + static const struct net_device_ops tun_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, @@ -890,6 +1051,9 @@ static const struct net_device_ops tun_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif + .ndo_set_rx_headroom = tun_set_headroom, + .ndo_get_stats64 = tun_net_get_stats64, + .ndo_change_carrier = tun_change_carrier, }; static const struct net_device_ops tap_netdev_ops = { @@ -906,7 +1070,9 @@ static const struct net_device_ops tap_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif - .ndo_change_carrier = tun_change_carrier, + .ndo_features_check = passthru_features_check, + .ndo_set_rx_headroom = tun_set_headroom, + .ndo_get_stats64 = tun_net_get_stats64, }; static void tun_flow_init(struct tun_struct *tun) @@ -934,7 +1100,7 @@ static void tun_net_init(struct net_device *dev) struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { - case TUN_TUN_DEV: + case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; /* Point-to-Point TUN Device */ @@ -945,10 +1111,9 @@ static void tun_net_init(struct net_device *dev) /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; - dev->tx_queue_len = TUN_READQ_SIZE*2; /* We prefer our own queue length */ break; - case TUN_TAP_DEV: + case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); @@ -957,7 +1122,6 @@ static void tun_net_init(struct net_device *dev) eth_hw_addr_random(dev); - dev->tx_queue_len = TUN_READQ_SIZE*2; /* We prefer our own queue length */ break; } } @@ -981,12 +1145,13 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) poll_wait(file, sk_sleep(sk), wait); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_array_empty(&tfile->tx_array)) mask |= POLLIN | POLLRDNORM; - if (sock_writeable(sk) || - (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) && - sock_writeable(sk))) + if (tun->dev->flags & IFF_UP && + (sock_writeable(sk) || + (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && + sock_writeable(sk)))) mask |= POLLOUT | POLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) @@ -1025,121 +1190,127 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, - void *msg_control, const struct iovec *iv, - size_t total_len, size_t count, int noblock) + void *msg_control, struct iov_iter *from, + int noblock) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; - size_t len = total_len, align = NET_SKB_PAD, linear; + size_t total_len = iov_iter_count(from); + size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; + struct tun_pcpu_stats *stats; int good_linear; - int offset = 0; int copylen; bool zerocopy = false; int err; u32 rxhash; + ssize_t n; + + if (!(tun->dev->flags & IFF_UP)) + return -EIO; - if (!(tun->flags & TUN_NO_PI)) { + if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); - if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) + n = copy_from_iter(&pi, sizeof(pi), from); + if (n != sizeof(pi)) return -EFAULT; - offset += sizeof(pi); } - if (tun->flags & TUN_VNET_HDR) { - if (len < tun->vnet_hdr_sz) + if (tun->flags & IFF_VNET_HDR) { + int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); + + if (len < vnet_hdr_sz) return -EINVAL; - len -= tun->vnet_hdr_sz; + len -= vnet_hdr_sz; - if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) + n = copy_from_iter(&gso, sizeof(gso), from); + if (n != sizeof(gso)) return -EFAULT; if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - gso.csum_start + gso.csum_offset + 2 > gso.hdr_len) - gso.hdr_len = gso.csum_start + gso.csum_offset + 2; + tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len)) + gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2); - if (gso.hdr_len > len) + if (tun16_to_cpu(tun, gso.hdr_len) > len) return -EINVAL; - offset += tun->vnet_hdr_sz; + iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); } - if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) { + if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || - (gso.hdr_len && gso.hdr_len < ETH_HLEN))) + (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { + struct iov_iter i = *from; + /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ - copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN; + copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; linear = copylen; - if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS) + iov_iter_advance(&i, copylen); + if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!zerocopy) { copylen = len; - if (gso.hdr_len > good_linear) + if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) linear = good_linear; else - linear = gso.hdr_len; + linear = tun16_to_cpu(tun, gso.hdr_len); } skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) - tun->dev->stats.rx_dropped++; + this_cpu_inc(tun->pcpu_stats->rx_dropped); return PTR_ERR(skb); } if (zerocopy) - err = zerocopy_sg_from_iovec(skb, iv, offset, count); - else { - err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); - if (!err && msg_control) { - struct ubuf_info *uarg = msg_control; - uarg->callback(uarg, false); - } - } + err = zerocopy_sg_from_iter(skb, from); + else + err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { - tun->dev->stats.rx_dropped++; + this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); return -EFAULT; } - if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - if (!skb_partial_csum_set(skb, gso.csum_start, - gso.csum_offset)) { - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; - } + err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun)); + if (err) { + this_cpu_inc(tun->pcpu_stats->rx_frame_errors); + kfree_skb(skb); + return -EINVAL; } switch (tun->flags & TUN_TYPE_MASK) { - case TUN_TUN_DEV: - if (tun->flags & TUN_NO_PI) { - switch (skb->data[0] & 0xf0) { - case 0x40: + case IFF_TUN: + if (tun->flags & IFF_NO_PI) { + u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; + + switch (ip_version) { + case 4: pi.proto = htons(ETH_P_IP); break; - case 0x60: + case 6: pi.proto = htons(ETH_P_IPV6); break; default: - tun->dev->stats.rx_dropped++; + this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); return -EINVAL; } @@ -1149,69 +1320,39 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb->protocol = pi.proto; skb->dev = tun->dev; break; - case TUN_TAP_DEV: + case IFF_TAP: skb->protocol = eth_type_trans(skb, tun->dev); break; } - skb_reset_network_header(skb); - - if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { - pr_debug("GSO!\n"); - switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { - case VIRTIO_NET_HDR_GSO_TCPV4: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - break; - case VIRTIO_NET_HDR_GSO_TCPV6: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; - break; - case VIRTIO_NET_HDR_GSO_UDP: - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - if (skb->protocol == htons(ETH_P_IPV6)) - ipv6_proxy_select_ident(skb); - break; - default: - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; - } - - if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - - skb_shinfo(skb)->gso_size = gso.gso_size; - if (skb_shinfo(skb)->gso_size == 0) { - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; - } - - /* Header must be checked, and gso_segs computed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; - } - /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_shinfo(skb)->destructor_arg = msg_control; skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + } else if (msg_control) { + struct ubuf_info *uarg = msg_control; + uarg->callback(uarg, false); } + skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); rxhash = skb_get_hash(skb); netif_rx_ni(skb); - tun->dev->stats.rx_packets++; - tun->dev->stats.rx_bytes += len; + stats = get_cpu_ptr(tun->pcpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += len; + u64_stats_update_end(&stats->syncp); + put_cpu_ptr(stats); tun_flow_update(tun, rxhash, tfile); return total_len; } -static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_struct *tun = tun_get(file); @@ -1221,10 +1362,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, if (!tun) return -EBADFD; - tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); - - result = tun_get_user(tun, tfile, NULL, iv, iov_length(iv, count), - count, file->f_flags & O_NONBLOCK); + result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK); tun_put(tun); return result; @@ -1234,167 +1372,183 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; - ssize_t total = 0; - int vlan_offset = 0, copied; + struct tun_pcpu_stats *stats; + ssize_t total; + int vlan_offset = 0; int vlan_hlen = 0; + int vnet_hdr_sz = 0; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; - if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) < 0) + if (tun->flags & IFF_VNET_HDR) + vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); + + total = skb->len + vlan_hlen + vnet_hdr_sz; + + if (!(tun->flags & IFF_NO_PI)) { + if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; - if (len < skb->len) { + total += sizeof(pi); + if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } - if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; - total += sizeof(pi); } - if (tun->flags & TUN_VNET_HDR) { + if (vnet_hdr_sz) { struct virtio_net_hdr gso = { 0 }; /* no info leak */ - if ((len -= tun->vnet_hdr_sz) < 0) + int ret; + + if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; - if (skb_is_gso(skb)) { + ret = virtio_net_hdr_from_skb(skb, &gso, + tun_is_little_endian(tun), true); + if (ret) { struct skb_shared_info *sinfo = skb_shinfo(skb); + pr_err("unexpected GSO type: " + "0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), + tun16_to_cpu(tun, gso.hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + WARN_ON_ONCE(1); + return -EINVAL; + } - /* This is a hint as to how much should be linear. */ - gso.hdr_len = skb_headlen(skb); - gso.gso_size = sinfo->gso_size; - if (sinfo->gso_type & SKB_GSO_TCPV4) - gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else if (sinfo->gso_type & SKB_GSO_TCPV6) - gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - gso.gso_type = VIRTIO_NET_HDR_GSO_UDP; - else { - pr_err("unexpected GSO type: " - "0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, gso.gso_size, - gso.hdr_len); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)gso.hdr_len, 64), true); - WARN_ON_ONCE(1); - return -EINVAL; - } - if (sinfo->gso_type & SKB_GSO_TCP_ECN) - gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } else - gso.gso_type = VIRTIO_NET_HDR_GSO_NONE; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - gso.csum_start = skb_checksum_start_offset(skb) + - vlan_hlen; - gso.csum_offset = skb->csum_offset; - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; - } /* else everything is zero */ - - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) return -EFAULT; - total += tun->vnet_hdr_sz; + + iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } - copied = total; - len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { - int copy, ret; + int ret; struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; - veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: - tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + /* caller is in process context, */ + stats = get_cpu_ptr(tun->pcpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->tx_packets++; + stats->tx_bytes += skb->len + vlan_hlen; + u64_stats_update_end(&stats->syncp); + put_cpu_ptr(tun->pcpu_stats); return total; } +static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, + int *err) +{ + DECLARE_WAITQUEUE(wait, current); + struct sk_buff *skb = NULL; + int error = 0; + + skb = skb_array_consume(&tfile->tx_array); + if (skb) + goto out; + if (noblock) { + error = -EAGAIN; + goto out; + } + + add_wait_queue(&tfile->wq.wait, &wait); + current->state = TASK_INTERRUPTIBLE; + + while (1) { + skb = skb_array_consume(&tfile->tx_array); + if (skb) + break; + if (signal_pending(current)) { + error = -ERESTARTSYS; + break; + } + if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { + error = -EFAULT; + break; + } + + schedule(); + } + + current->state = TASK_RUNNING; + remove_wait_queue(&tfile->wq.wait, &wait); + +out: + *err = error; + return skb; +} + static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + struct iov_iter *to, + int noblock) { struct sk_buff *skb; - ssize_t ret = 0; - int peeked, err, off = 0; + ssize_t ret; + int err; tun_debug(KERN_INFO, tun, "tun_do_read\n"); - if (!len) - return ret; + if (!iov_iter_count(to)) + return 0; - if (tun->dev->reg_state != NETREG_REGISTERED) - return -EIO; + /* Read frames from ring */ + skb = tun_ring_recv(tfile, noblock, &err); + if (!skb) + return err; - /* Read frames from queue */ - skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, - &peeked, &off, &err); - if (skb) { - ret = tun_put_user(tun, tfile, skb, iv, len); + ret = tun_put_user(tun, tfile, skb, to); + if (unlikely(ret < 0)) kfree_skb(skb); - } else - ret = err; + else + consume_skb(skb); return ret; } -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = __tun_get(tfile); - ssize_t len, ret; + ssize_t len = iov_iter_count(to), ret; if (!tun) return -EBADFD; - len = iov_length(iv, count); - if (len < 0) { - ret = -EINVAL; - goto out; - } - - ret = tun_do_read(tun, tfile, iv, len, - file->f_flags & O_NONBLOCK); + ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; -out: tun_put(tun); return ret; } @@ -1404,6 +1558,7 @@ static void tun_free_netdev(struct net_device *dev) struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); + free_percpu(tun->pcpu_stats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); free_netdev(dev); @@ -1418,6 +1573,8 @@ static void tun_setup(struct net_device *dev) dev->ethtool_ops = &tun_ethtool_ops; dev->destructor = tun_free_netdev; + /* We prefer our own queue length */ + dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap @@ -1443,7 +1600,7 @@ static void tun_sock_write_space(struct sock *sk) if (!sock_writeable(sk)) return; - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); @@ -1455,8 +1612,7 @@ static void tun_sock_write_space(struct sock *sk) kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } -static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; struct tun_file *tfile = container_of(sock, struct tun_file, socket); @@ -1464,14 +1620,14 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, if (!tun) return -EBADFD; - ret = tun_get_user(tun, tfile, m->msg_control, m->msg_iov, total_len, - m->msg_iovlen, m->msg_flags & MSG_DONTWAIT); + + ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, + m->msg_flags & MSG_DONTWAIT); tun_put(tun); return ret; } -static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len, +static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); @@ -1490,9 +1646,8 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, m->msg_iov, total_len, - flags & MSG_DONTWAIT); - if (ret > total_len) { + ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT); + if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } @@ -1501,54 +1656,38 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, return ret; } -static int tun_release(struct socket *sock) +static int tun_peek_len(struct socket *sock) { - if (sock->sk) - sock_put(sock->sk); - return 0; + struct tun_file *tfile = container_of(sock, struct tun_file, socket); + struct tun_struct *tun; + int ret = 0; + + tun = __tun_get(tfile); + if (!tun) + return 0; + + ret = skb_array_peek_len(&tfile->tx_array); + tun_put(tun); + + return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { + .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, - .release = tun_release, }; static struct proto tun_proto = { - .name = "bf_tun", + .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { - int flags = 0; - - if (tun->flags & TUN_TUN_DEV) - flags |= IFF_TUN; - else - flags |= IFF_TAP; - - if (tun->flags & TUN_NO_PI) - flags |= IFF_NO_PI; - - /* This flag has no real effect. We track the value for backwards - * compatibility. - */ - if (tun->flags & TUN_ONE_QUEUE) - flags |= IFF_ONE_QUEUE; - - if (tun->flags & TUN_VNET_HDR) - flags |= IFF_VNET_HDR; - - if (tun->flags & TUN_TAP_MQ) - flags |= IFF_MULTI_QUEUE; - - if (tun->flags & TUN_PERSIST) - flags |= IFF_PERSIST; - - return flags; + return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr, @@ -1582,6 +1721,17 @@ static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL); static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL); static DEVICE_ATTR(group, 0444, tun_show_group, NULL); +static struct attribute *tun_dev_attrs[] = { + &dev_attr_tun_flags.attr, + &dev_attr_owner.attr, + &dev_attr_group.attr, + NULL +}; + +static const struct attribute_group tun_attr_group = { + .attrs = tun_dev_attrs +}; + static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; @@ -1604,7 +1754,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != - !!(tun->flags & TUN_TAP_MQ)) + !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) @@ -1617,7 +1767,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - if (tun->flags & TUN_TAP_MQ && + if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. @@ -1640,11 +1790,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ - flags |= TUN_TUN_DEV; + flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ - flags |= TUN_TAP_DEV; + flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; @@ -1653,14 +1803,20 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, - tun_setup, queues, queues); + NET_NAME_UNKNOWN, tun_setup, queues, + queues); if (!dev) return -ENOMEM; - +#if 0 + err = dev_get_valid_name(net, dev, name); + if (err < 0) + goto err_free_dev; +#endif dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; + dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; @@ -1668,14 +1824,21 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); + tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; + tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); + if (!tun->pcpu_stats) { + err = -ENOMEM; + goto err_free_dev; + } + spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) - goto err_free_dev; + goto err_free_stat; tun_net_init(dev); tun_flow_init(tun); @@ -1683,7 +1846,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->features = dev->hw_features; + dev->features = dev->hw_features | NETIF_F_LLTX; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); @@ -1696,39 +1859,14 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err = register_netdevice(tun->dev); if (err < 0) goto err_detach; - - if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) || - device_create_file(&tun->dev->dev, &dev_attr_owner) || - device_create_file(&tun->dev->dev, &dev_attr_group)) - pr_err("Failed to create tun sysfs files\n"); } netif_carrier_on(tun->dev); tun_debug(KERN_INFO, tun, "tun_set_iff\n"); - if (ifr->ifr_flags & IFF_NO_PI) - tun->flags |= TUN_NO_PI; - else - tun->flags &= ~TUN_NO_PI; - - /* This flag has no real effect. We track the value for backwards - * compatibility. - */ - if (ifr->ifr_flags & IFF_ONE_QUEUE) - tun->flags |= TUN_ONE_QUEUE; - else - tun->flags &= ~TUN_ONE_QUEUE; - - if (ifr->ifr_flags & IFF_VNET_HDR) - tun->flags |= TUN_VNET_HDR; - else - tun->flags &= ~TUN_VNET_HDR; - - if (ifr->ifr_flags & IFF_MULTI_QUEUE) - tun->flags |= TUN_TAP_MQ; - else - tun->flags &= ~TUN_TAP_MQ; + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); /* Make sure persistent devices do not get stuck in * xoff state. @@ -1744,6 +1882,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err_free_flow: tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); +err_free_stat: + free_percpu(tun->pcpu_stats); err_free_dev: free_netdev(dev); return err; @@ -1806,7 +1946,9 @@ static void tun_detach_filter(struct tun_struct *tun, int n) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - __sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held()); + lock_sock(tfile->socket.sk); + sk_detach_filter(tfile->socket.sk); + release_sock(tfile->socket.sk); } tun->filter_attached = false; @@ -1819,8 +1961,9 @@ static int tun_attach_filter(struct tun_struct *tun) for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk, - lockdep_rtnl_is_held()); + lock_sock(tfile->socket.sk); + ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); + release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; @@ -1862,7 +2005,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = tun_attach(tun, file, false); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); - if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached) + if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); @@ -1886,6 +2029,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int sndbuf; int vnet_hdr_sz; unsigned int ifindex; + int le; int ret; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) { @@ -1897,9 +2041,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on - * TUNSETIFF. */ - return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | - IFF_VNET_HDR | IFF_MULTI_QUEUE, + * TUNSETIFF. + */ + return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) return tun_set_queue(file, &ifr); @@ -1911,7 +2055,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (cmd == TUNSETIFF && !tun) { ifr.ifr_name[IFNAMSIZ-1] = '\0'; - ret = tun_set_iff(tfile->net, file, &ifr); + ret = tun_set_iff(sock_net(&tfile->sk), file, &ifr); if (ret) goto unlock; @@ -1966,12 +2110,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ - if (arg && !(tun->flags & TUN_PERSIST)) { - tun->flags |= TUN_PERSIST; + if (arg && !(tun->flags & IFF_PERSIST)) { + tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); } - if (!arg && (tun->flags & TUN_PERSIST)) { - tun->flags &= ~TUN_PERSIST; + if (!arg && (tun->flags & IFF_PERSIST)) { + tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); } @@ -2029,7 +2173,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; @@ -2061,6 +2205,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; break; } + if (sndbuf <= 0) { + ret = -EINVAL; + break; + } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); @@ -2085,10 +2233,35 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun->vnet_hdr_sz = vnet_hdr_sz; break; + case TUNGETVNETLE: + le = !!(tun->flags & TUN_VNET_LE); + if (put_user(le, (int __user *)argp)) + ret = -EFAULT; + break; + + case TUNSETVNETLE: + if (get_user(le, (int __user *)argp)) { + ret = -EFAULT; + break; + } + if (le) + tun->flags |= TUN_VNET_LE; + else + tun->flags &= ~TUN_VNET_LE; + break; + + case TUNGETVNETBE: + ret = tun_get_vnet_be(tun, argp); + break; + + case TUNSETVNETBE: + ret = tun_set_vnet_be(tun, argp); + break; + case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) @@ -2100,7 +2273,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); @@ -2108,7 +2281,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNGETFILTER: ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) @@ -2172,9 +2345,7 @@ static int tun_chr_fasync(int fd, struct file *file, int on) goto out; if (on) { - ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0); - if (ret) - goto out; + __f_setown(file, task_pid(current), PIDTYPE_PID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; @@ -2185,16 +2356,16 @@ static int tun_chr_fasync(int fd, struct file *file, int on) static int tun_chr_open(struct inode *inode, struct file * file) { + struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; DBG1(KERN_INFO, "tunX: tun_chr_open\n"); - tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, - &tun_proto); + tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, + &tun_proto, 0); if (!tfile) return -ENOMEM; RCU_INIT_POINTER(tfile->tun, NULL); - tfile->net = get_net(current->nsproxy->net_ns); tfile->flags = 0; tfile->ifindex = 0; @@ -2205,33 +2376,31 @@ static int tun_chr_open(struct inode *inode, struct file * file) tfile->socket.ops = &tun_socket_ops; sock_init_data(&tfile->socket, &tfile->sk); - sk_change_net(&tfile->sk, tfile->net); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; - set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags); INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); + memset(&tfile->tx_array, 0, sizeof(tfile->tx_array)); + return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; - struct net *net = tfile->net; tun_detach(tfile, true); - put_net(net); return 0; } #ifdef CONFIG_PROC_FS -static int tun_chr_show_fdinfo(struct seq_file *m, struct file *f) +static void tun_chr_show_fdinfo(struct seq_file *m, struct file *f) { struct tun_struct *tun; struct ifreq ifr; @@ -2247,17 +2416,15 @@ static int tun_chr_show_fdinfo(struct seq_file *m, struct file *f) if (tun) tun_put(tun); - return seq_printf(m, "iff:\t%s\n", ifr.ifr_name); + seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .llseek = no_llseek, - .read = do_sync_read, - .aio_read = tun_chr_aio_read, - .write = do_sync_write, - .aio_write = tun_chr_aio_write, + .read_iter = tun_chr_read_iter, + .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT @@ -2272,7 +2439,7 @@ static const struct file_operations tun_fops = { }; static struct miscdevice tun_miscdev = { - .minor = (TUN_MINOR1), + .minor = TUN_MINOR1, .name = "bf_tun", .nodename = "net/bf_tun", .fops = &tun_fops, @@ -2303,10 +2470,10 @@ static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info strlcpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { - case TUN_TUN_DEV: - strlcpy(info->bus_info, "bf_tun", sizeof(info->bus_info)); + case IFF_TUN: + strlcpy(info->bus_info, "tun", sizeof(info->bus_info)); break; - case TUN_TAP_DEV: + case IFF_TAP: strlcpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } @@ -2339,6 +2506,56 @@ static const struct ethtool_ops tun_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; +static int tun_queue_resize(struct tun_struct *tun) +{ + struct net_device *dev = tun->dev; + struct tun_file *tfile; + struct skb_array **arrays; + int n = tun->numqueues + tun->numdisabled; + int ret, i; + + arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL); + if (!arrays) + return -ENOMEM; + + for (i = 0; i < tun->numqueues; i++) { + tfile = rtnl_dereference(tun->tfiles[i]); + arrays[i] = &tfile->tx_array; + } + list_for_each_entry(tfile, &tun->disabled, next) + arrays[i++] = &tfile->tx_array; + + ret = skb_array_resize_multiple(arrays, n, + dev->tx_queue_len, GFP_KERNEL); + + kfree(arrays); + return ret; +} + +static int tun_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct tun_struct *tun = netdev_priv(dev); + + if (dev->rtnl_link_ops != &tun_link_ops) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGE_TX_QUEUE_LEN: + if (tun_queue_resize(tun)) + return NOTIFY_BAD; + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block tun_notifier_block __read_mostly = { + .notifier_call = tun_device_event, +}; static int __init tun_init(void) { @@ -2355,9 +2572,11 @@ static int __init tun_init(void) ret = misc_register(&tun_miscdev); if (ret) { - pr_err("Can't register misc device %d\n", (TUN_MINOR1)); + pr_err("Can't register misc device %d\n", TUN_MINOR1); goto err_misc; } + + register_netdevice_notifier(&tun_notifier_block); return 0; err_misc: rtnl_link_unregister(&tun_link_ops); @@ -2369,13 +2588,14 @@ static void tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); + unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ -struct socket *bf_tun_get_socket(struct file *file) +struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) @@ -2385,7 +2605,7 @@ struct socket *bf_tun_get_socket(struct file *file) return ERR_PTR(-EBADFD); return &tfile->socket; } -EXPORT_SYMBOL_GPL(bf_tun_get_socket); +EXPORT_SYMBOL_GPL(tun_get_socket); module_init(tun_init); module_exit(tun_cleanup); diff --git a/platform/barefoot/sonic-platform-modules-wnc-osw1800/modules/bf_tun.c b/platform/barefoot/sonic-platform-modules-wnc-osw1800/modules/bf_tun.c index a1ba7047baaa..abb906a24618 100644 --- a/platform/barefoot/sonic-platform-modules-wnc-osw1800/modules/bf_tun.c +++ b/platform/barefoot/sonic-platform-modules-wnc-osw1800/modules/bf_tun.c @@ -65,20 +65,21 @@ #include #include #include -#include #include #include #include #include #include +#include +#include #include +#define TUN_MINOR1 201 + /* Uncomment to enable debugging */ /* #define TUN_DEBUG 1 */ -#define TUN_MINOR1 201 - #ifdef TUN_DEBUG static int debug; @@ -105,6 +106,18 @@ do { \ } while (0) #endif +/* TUN device flags */ + +/* IFF_ATTACH_QUEUE is never stored in device flags, + * overload it to mean fasync when stored there. + */ +#define TUN_FASYNC IFF_ATTACH_QUEUE +/* High bits in flags field are unused. */ +#define TUN_VNET_LE 0x80000000 +#define TUN_VNET_BE 0x40000000 + +#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ + IFF_MULTI_QUEUE) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -114,14 +127,24 @@ struct tap_filter { unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; -/* DEFAULT_MAX_NUM_RSS_QUEUES were chosen to let the rx/tx queues allocated for - * the netdevice to be fit in one page. So we can make sure the success of - * memory allocation. TODO: increase the limit. */ -#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES +/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal + * to max number of VCPUs in guest. */ +#define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) +struct tun_pcpu_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 rx_dropped; + u32 tx_dropped; + u32 rx_frame_errors; +}; + /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and @@ -138,7 +161,6 @@ struct tun_file { struct socket socket; struct socket_wq wq; struct tun_struct __rcu *tun; - struct net *net; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; @@ -148,6 +170,7 @@ struct tun_file { }; struct list_head next; struct tun_struct *detached; + struct skb_array tx_array; }; struct tun_flow_entry { @@ -179,6 +202,7 @@ struct tun_struct { #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6|NETIF_F_UFO) + int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; @@ -196,8 +220,73 @@ struct tun_struct { struct list_head disabled; void *security; u32 flow_count; + struct tun_pcpu_stats __percpu *pcpu_stats; }; +#ifdef CONFIG_TUN_VNET_CROSS_LE +static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) +{ + return tun->flags & TUN_VNET_BE ? false : + virtio_legacy_is_little_endian(); +} + +static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) +{ + int be = !!(tun->flags & TUN_VNET_BE); + + if (put_user(be, argp)) + return -EFAULT; + + return 0; +} + +static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) +{ + int be; + + if (get_user(be, argp)) + return -EFAULT; + + if (be) + tun->flags |= TUN_VNET_BE; + else + tun->flags &= ~TUN_VNET_BE; + + return 0; +} +#else +static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) +{ + return virtio_legacy_is_little_endian(); +} + +static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) +{ + return -EINVAL; +} + +static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) +{ + return -EINVAL; +} +#endif /* CONFIG_TUN_VNET_CROSS_LE */ + +static inline bool tun_is_little_endian(struct tun_struct *tun) +{ + return tun->flags & TUN_VNET_LE || + tun_legacy_is_little_endian(tun); +} + +static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val) +{ + return __virtio16_to_cpu(tun_is_little_endian(tun), val); +} + +static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val) +{ + return __cpu_to_virtio16(tun_is_little_endian(tun), val); +} + static inline u32 tun_hashfn(u32 rxhash) { return rxhash & 0x3ff; @@ -238,7 +327,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); - sock_rps_reset_flow_hash(e->rps_rxhash); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; @@ -355,10 +443,8 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash, */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { - if (unlikely(e->rps_rxhash != hash)) { - sock_rps_reset_flow_hash(e->rps_rxhash); + if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; - } } /* We try to identify a flow through its rxhash first. The reason that @@ -433,10 +519,22 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) static void tun_queue_purge(struct tun_file *tfile) { - skb_queue_purge(&tfile->sk.sk_receive_queue); + struct sk_buff *skb; + + while ((skb = skb_array_consume(&tfile->tx_array)) != NULL) + kfree_skb(skb); + skb_queue_purge(&tfile->sk.sk_error_queue); } +static void tun_cleanup_tx_array(struct tun_file *tfile) +{ + if (tfile->tx_array.ring.queue) { + skb_array_cleanup(&tfile->tx_array); + memset(&tfile->tx_array, 0, sizeof(tfile->tx_array)); + } +} + static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; @@ -474,14 +572,12 @@ static void __tun_detach(struct tun_file *tfile, bool clean) if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); - if (!(tun->flags & TUN_PERSIST) && + if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } - - BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, - &tfile->socket.flags)); - sk_release_kernel(&tfile->sk); + tun_cleanup_tx_array(tfile); + sock_put(&tfile->sk); } } @@ -501,11 +597,13 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); + tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { + tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } @@ -517,21 +615,24 @@ static void tun_detach_all(struct net_device *dev) /* Drop read queue */ tun_queue_purge(tfile); sock_put(&tfile->sk); + tun_cleanup_tx_array(tfile); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); tun_queue_purge(tfile); sock_put(&tfile->sk); + tun_cleanup_tx_array(tfile); } BUG_ON(tun->numdisabled != 0); - if (tun->flags & TUN_PERSIST) + if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) { struct tun_file *tfile = file->private_data; + struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); @@ -543,7 +644,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte goto out; err = -EBUSY; - if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1) + if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; @@ -555,12 +656,21 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { - err = __sk_attach_filter(&tun->fprog, tfile->socket.sk, - lockdep_rtnl_is_held()); + lock_sock(tfile->socket.sk); + err = sk_attach_filter(&tun->fprog, tfile->socket.sk); + release_sock(tfile->socket.sk); if (!err) goto out; } + + if (!tfile->detached && + skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) { + err = -ENOMEM; + goto out; + } + tfile->queue_index = tun->numqueues; + tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; @@ -632,14 +742,9 @@ static int update_filter(struct tap_filter *filter, void __user *arg) } alen = ETH_ALEN * uf.count; - addr = kmalloc(alen, GFP_KERNEL); - if (!addr) - return -ENOMEM; - - if (copy_from_user(addr, arg + sizeof(uf), alen)) { - err = -EFAULT; - goto done; - } + addr = memdup_user(arg + sizeof(uf), alen); + if (IS_ERR(addr)) + return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst @@ -659,7 +764,7 @@ static int update_filter(struct tap_filter *filter, void __user *arg) for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ - goto done; + goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } @@ -675,8 +780,7 @@ static int update_filter(struct tap_filter *filter, void __user *arg) /* Return the number of exact filters */ err = nexact; - -done: +free_addr: kfree(addr); return err; } @@ -726,7 +830,18 @@ static void tun_net_uninit(struct net_device *dev) /* Net device open. */ static int tun_net_open(struct net_device *dev) { + struct tun_struct *tun = netdev_priv(dev); + int i; + netif_tx_start_all_queues(dev); + + for (i = 0; i < tun->numqueues; i++) { + struct tun_file *tfile; + + tfile = rtnl_dereference(tun->tfiles[i]); + tfile->socket.sk->sk_write_space(tfile->socket.sk); + } + return 0; } @@ -753,7 +868,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (txq >= numqueues) goto drop; - if (numqueues == 1) { +#ifdef CONFIG_RPS + if (numqueues == 1 && static_key_false(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ @@ -768,6 +884,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) tun_flow_save_rps_rxhash(e, rxhash); } } +#endif tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); @@ -793,10 +910,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; - if (skb->sk) { - sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); - sw_tx_timestamp(skb); - } + skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. @@ -805,8 +919,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); - /* Enqueue packet */ - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); + if (skb_array_produce(&tfile->tx_array, skb)) + goto drop; /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) @@ -817,11 +931,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; drop: - dev->stats.tx_dropped++; + this_cpu_inc(tun->pcpu_stats->tx_dropped); skb_tx_error(skb); kfree_skb(skb); rcu_read_unlock(); - return NETDEV_TX_OK; + return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) @@ -852,16 +966,6 @@ static netdev_features_t tun_net_fix_features(struct net_device *dev, return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } - -static int -tun_change_carrier(struct net_device *dev, bool new_carrier) { - if (new_carrier) - netif_carrier_on(dev); - else - netif_carrier_off(dev); - return 0; -} - #ifdef CONFIG_NET_POLL_CONTROLLER static void tun_poll_controller(struct net_device *dev) { @@ -879,6 +983,63 @@ static void tun_poll_controller(struct net_device *dev) return; } #endif + +static void tun_set_headroom(struct net_device *dev, int new_hr) +{ + struct tun_struct *tun = netdev_priv(dev); + + if (new_hr < NET_SKB_PAD) + new_hr = NET_SKB_PAD; + + tun->align = new_hr; +} + +static struct rtnl_link_stats64 * +tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0; + struct tun_struct *tun = netdev_priv(dev); + struct tun_pcpu_stats *p; + int i; + + for_each_possible_cpu(i) { + u64 rxpackets, rxbytes, txpackets, txbytes; + unsigned int start; + + p = per_cpu_ptr(tun->pcpu_stats, i); + do { + start = u64_stats_fetch_begin(&p->syncp); + rxpackets = p->rx_packets; + rxbytes = p->rx_bytes; + txpackets = p->tx_packets; + txbytes = p->tx_bytes; + } while (u64_stats_fetch_retry(&p->syncp, start)); + + stats->rx_packets += rxpackets; + stats->rx_bytes += rxbytes; + stats->tx_packets += txpackets; + stats->tx_bytes += txbytes; + + /* u32 counters */ + rx_dropped += p->rx_dropped; + rx_frame_errors += p->rx_frame_errors; + tx_dropped += p->tx_dropped; + } + stats->rx_dropped = rx_dropped; + stats->rx_frame_errors = rx_frame_errors; + stats->tx_dropped = tx_dropped; + return stats; +} + +static int +tun_change_carrier(struct net_device *dev, bool new_carrier) { + if (new_carrier) + netif_carrier_on(dev); + else + netif_carrier_off(dev); + return 0; +} + static const struct net_device_ops tun_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, @@ -890,6 +1051,9 @@ static const struct net_device_ops tun_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif + .ndo_set_rx_headroom = tun_set_headroom, + .ndo_get_stats64 = tun_net_get_stats64, + .ndo_change_carrier = tun_change_carrier, }; static const struct net_device_ops tap_netdev_ops = { @@ -906,7 +1070,9 @@ static const struct net_device_ops tap_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif - .ndo_change_carrier = tun_change_carrier, + .ndo_features_check = passthru_features_check, + .ndo_set_rx_headroom = tun_set_headroom, + .ndo_get_stats64 = tun_net_get_stats64, }; static void tun_flow_init(struct tun_struct *tun) @@ -934,7 +1100,7 @@ static void tun_net_init(struct net_device *dev) struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { - case TUN_TUN_DEV: + case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; /* Point-to-Point TUN Device */ @@ -945,10 +1111,9 @@ static void tun_net_init(struct net_device *dev) /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; - dev->tx_queue_len = TUN_READQ_SIZE*2; /* We prefer our own queue length */ break; - case TUN_TAP_DEV: + case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); @@ -957,7 +1122,6 @@ static void tun_net_init(struct net_device *dev) eth_hw_addr_random(dev); - dev->tx_queue_len = TUN_READQ_SIZE*2; /* We prefer our own queue length */ break; } } @@ -981,12 +1145,13 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) poll_wait(file, sk_sleep(sk), wait); - if (!skb_queue_empty(&sk->sk_receive_queue)) + if (!skb_array_empty(&tfile->tx_array)) mask |= POLLIN | POLLRDNORM; - if (sock_writeable(sk) || - (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) && - sock_writeable(sk))) + if (tun->dev->flags & IFF_UP && + (sock_writeable(sk) || + (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && + sock_writeable(sk)))) mask |= POLLOUT | POLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) @@ -1025,121 +1190,127 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, - void *msg_control, const struct iovec *iv, - size_t total_len, size_t count, int noblock) + void *msg_control, struct iov_iter *from, + int noblock) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; - size_t len = total_len, align = NET_SKB_PAD, linear; + size_t total_len = iov_iter_count(from); + size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; + struct tun_pcpu_stats *stats; int good_linear; - int offset = 0; int copylen; bool zerocopy = false; int err; u32 rxhash; + ssize_t n; + + if (!(tun->dev->flags & IFF_UP)) + return -EIO; - if (!(tun->flags & TUN_NO_PI)) { + if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); - if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) + n = copy_from_iter(&pi, sizeof(pi), from); + if (n != sizeof(pi)) return -EFAULT; - offset += sizeof(pi); } - if (tun->flags & TUN_VNET_HDR) { - if (len < tun->vnet_hdr_sz) + if (tun->flags & IFF_VNET_HDR) { + int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); + + if (len < vnet_hdr_sz) return -EINVAL; - len -= tun->vnet_hdr_sz; + len -= vnet_hdr_sz; - if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) + n = copy_from_iter(&gso, sizeof(gso), from); + if (n != sizeof(gso)) return -EFAULT; if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - gso.csum_start + gso.csum_offset + 2 > gso.hdr_len) - gso.hdr_len = gso.csum_start + gso.csum_offset + 2; + tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len)) + gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2); - if (gso.hdr_len > len) + if (tun16_to_cpu(tun, gso.hdr_len) > len) return -EINVAL; - offset += tun->vnet_hdr_sz; + iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); } - if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) { + if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || - (gso.hdr_len && gso.hdr_len < ETH_HLEN))) + (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { + struct iov_iter i = *from; + /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ - copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN; + copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; linear = copylen; - if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS) + iov_iter_advance(&i, copylen); + if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!zerocopy) { copylen = len; - if (gso.hdr_len > good_linear) + if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) linear = good_linear; else - linear = gso.hdr_len; + linear = tun16_to_cpu(tun, gso.hdr_len); } skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) - tun->dev->stats.rx_dropped++; + this_cpu_inc(tun->pcpu_stats->rx_dropped); return PTR_ERR(skb); } if (zerocopy) - err = zerocopy_sg_from_iovec(skb, iv, offset, count); - else { - err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); - if (!err && msg_control) { - struct ubuf_info *uarg = msg_control; - uarg->callback(uarg, false); - } - } + err = zerocopy_sg_from_iter(skb, from); + else + err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { - tun->dev->stats.rx_dropped++; + this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); return -EFAULT; } - if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - if (!skb_partial_csum_set(skb, gso.csum_start, - gso.csum_offset)) { - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; - } + err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun)); + if (err) { + this_cpu_inc(tun->pcpu_stats->rx_frame_errors); + kfree_skb(skb); + return -EINVAL; } switch (tun->flags & TUN_TYPE_MASK) { - case TUN_TUN_DEV: - if (tun->flags & TUN_NO_PI) { - switch (skb->data[0] & 0xf0) { - case 0x40: + case IFF_TUN: + if (tun->flags & IFF_NO_PI) { + u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; + + switch (ip_version) { + case 4: pi.proto = htons(ETH_P_IP); break; - case 0x60: + case 6: pi.proto = htons(ETH_P_IPV6); break; default: - tun->dev->stats.rx_dropped++; + this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); return -EINVAL; } @@ -1149,69 +1320,39 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb->protocol = pi.proto; skb->dev = tun->dev; break; - case TUN_TAP_DEV: + case IFF_TAP: skb->protocol = eth_type_trans(skb, tun->dev); break; } - skb_reset_network_header(skb); - - if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { - pr_debug("GSO!\n"); - switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { - case VIRTIO_NET_HDR_GSO_TCPV4: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - break; - case VIRTIO_NET_HDR_GSO_TCPV6: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; - break; - case VIRTIO_NET_HDR_GSO_UDP: - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - if (skb->protocol == htons(ETH_P_IPV6)) - ipv6_proxy_select_ident(skb); - break; - default: - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; - } - - if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - - skb_shinfo(skb)->gso_size = gso.gso_size; - if (skb_shinfo(skb)->gso_size == 0) { - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; - } - - /* Header must be checked, and gso_segs computed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; - } - /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_shinfo(skb)->destructor_arg = msg_control; skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + } else if (msg_control) { + struct ubuf_info *uarg = msg_control; + uarg->callback(uarg, false); } + skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); rxhash = skb_get_hash(skb); netif_rx_ni(skb); - tun->dev->stats.rx_packets++; - tun->dev->stats.rx_bytes += len; + stats = get_cpu_ptr(tun->pcpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += len; + u64_stats_update_end(&stats->syncp); + put_cpu_ptr(stats); tun_flow_update(tun, rxhash, tfile); return total_len; } -static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_struct *tun = tun_get(file); @@ -1221,10 +1362,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, if (!tun) return -EBADFD; - tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); - - result = tun_get_user(tun, tfile, NULL, iv, iov_length(iv, count), - count, file->f_flags & O_NONBLOCK); + result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK); tun_put(tun); return result; @@ -1234,167 +1372,183 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; - ssize_t total = 0; - int vlan_offset = 0, copied; + struct tun_pcpu_stats *stats; + ssize_t total; + int vlan_offset = 0; int vlan_hlen = 0; + int vnet_hdr_sz = 0; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; - if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) < 0) + if (tun->flags & IFF_VNET_HDR) + vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); + + total = skb->len + vlan_hlen + vnet_hdr_sz; + + if (!(tun->flags & IFF_NO_PI)) { + if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; - if (len < skb->len) { + total += sizeof(pi); + if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } - if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; - total += sizeof(pi); } - if (tun->flags & TUN_VNET_HDR) { + if (vnet_hdr_sz) { struct virtio_net_hdr gso = { 0 }; /* no info leak */ - if ((len -= tun->vnet_hdr_sz) < 0) + int ret; + + if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; - if (skb_is_gso(skb)) { + ret = virtio_net_hdr_from_skb(skb, &gso, + tun_is_little_endian(tun), true); + if (ret) { struct skb_shared_info *sinfo = skb_shinfo(skb); + pr_err("unexpected GSO type: " + "0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), + tun16_to_cpu(tun, gso.hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + WARN_ON_ONCE(1); + return -EINVAL; + } - /* This is a hint as to how much should be linear. */ - gso.hdr_len = skb_headlen(skb); - gso.gso_size = sinfo->gso_size; - if (sinfo->gso_type & SKB_GSO_TCPV4) - gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else if (sinfo->gso_type & SKB_GSO_TCPV6) - gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - gso.gso_type = VIRTIO_NET_HDR_GSO_UDP; - else { - pr_err("unexpected GSO type: " - "0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, gso.gso_size, - gso.hdr_len); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)gso.hdr_len, 64), true); - WARN_ON_ONCE(1); - return -EINVAL; - } - if (sinfo->gso_type & SKB_GSO_TCP_ECN) - gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } else - gso.gso_type = VIRTIO_NET_HDR_GSO_NONE; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - gso.csum_start = skb_checksum_start_offset(skb) + - vlan_hlen; - gso.csum_offset = skb->csum_offset; - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; - } /* else everything is zero */ - - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) return -EFAULT; - total += tun->vnet_hdr_sz; + + iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } - copied = total; - len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { - int copy, ret; + int ret; struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; - veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: - tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + /* caller is in process context, */ + stats = get_cpu_ptr(tun->pcpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->tx_packets++; + stats->tx_bytes += skb->len + vlan_hlen; + u64_stats_update_end(&stats->syncp); + put_cpu_ptr(tun->pcpu_stats); return total; } +static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, + int *err) +{ + DECLARE_WAITQUEUE(wait, current); + struct sk_buff *skb = NULL; + int error = 0; + + skb = skb_array_consume(&tfile->tx_array); + if (skb) + goto out; + if (noblock) { + error = -EAGAIN; + goto out; + } + + add_wait_queue(&tfile->wq.wait, &wait); + current->state = TASK_INTERRUPTIBLE; + + while (1) { + skb = skb_array_consume(&tfile->tx_array); + if (skb) + break; + if (signal_pending(current)) { + error = -ERESTARTSYS; + break; + } + if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { + error = -EFAULT; + break; + } + + schedule(); + } + + current->state = TASK_RUNNING; + remove_wait_queue(&tfile->wq.wait, &wait); + +out: + *err = error; + return skb; +} + static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + struct iov_iter *to, + int noblock) { struct sk_buff *skb; - ssize_t ret = 0; - int peeked, err, off = 0; + ssize_t ret; + int err; tun_debug(KERN_INFO, tun, "tun_do_read\n"); - if (!len) - return ret; + if (!iov_iter_count(to)) + return 0; - if (tun->dev->reg_state != NETREG_REGISTERED) - return -EIO; + /* Read frames from ring */ + skb = tun_ring_recv(tfile, noblock, &err); + if (!skb) + return err; - /* Read frames from queue */ - skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, - &peeked, &off, &err); - if (skb) { - ret = tun_put_user(tun, tfile, skb, iv, len); + ret = tun_put_user(tun, tfile, skb, to); + if (unlikely(ret < 0)) kfree_skb(skb); - } else - ret = err; + else + consume_skb(skb); return ret; } -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = __tun_get(tfile); - ssize_t len, ret; + ssize_t len = iov_iter_count(to), ret; if (!tun) return -EBADFD; - len = iov_length(iv, count); - if (len < 0) { - ret = -EINVAL; - goto out; - } - - ret = tun_do_read(tun, tfile, iv, len, - file->f_flags & O_NONBLOCK); + ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; -out: tun_put(tun); return ret; } @@ -1404,6 +1558,7 @@ static void tun_free_netdev(struct net_device *dev) struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); + free_percpu(tun->pcpu_stats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); free_netdev(dev); @@ -1418,6 +1573,8 @@ static void tun_setup(struct net_device *dev) dev->ethtool_ops = &tun_ethtool_ops; dev->destructor = tun_free_netdev; + /* We prefer our own queue length */ + dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap @@ -1443,7 +1600,7 @@ static void tun_sock_write_space(struct sock *sk) if (!sock_writeable(sk)) return; - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); @@ -1455,8 +1612,7 @@ static void tun_sock_write_space(struct sock *sk) kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } -static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; struct tun_file *tfile = container_of(sock, struct tun_file, socket); @@ -1464,14 +1620,14 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, if (!tun) return -EBADFD; - ret = tun_get_user(tun, tfile, m->msg_control, m->msg_iov, total_len, - m->msg_iovlen, m->msg_flags & MSG_DONTWAIT); + + ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, + m->msg_flags & MSG_DONTWAIT); tun_put(tun); return ret; } -static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len, +static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); @@ -1490,9 +1646,8 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, m->msg_iov, total_len, - flags & MSG_DONTWAIT); - if (ret > total_len) { + ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT); + if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } @@ -1501,54 +1656,38 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, return ret; } -static int tun_release(struct socket *sock) +static int tun_peek_len(struct socket *sock) { - if (sock->sk) - sock_put(sock->sk); - return 0; + struct tun_file *tfile = container_of(sock, struct tun_file, socket); + struct tun_struct *tun; + int ret = 0; + + tun = __tun_get(tfile); + if (!tun) + return 0; + + ret = skb_array_peek_len(&tfile->tx_array); + tun_put(tun); + + return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { + .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, - .release = tun_release, }; static struct proto tun_proto = { - .name = "bf_tun", + .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { - int flags = 0; - - if (tun->flags & TUN_TUN_DEV) - flags |= IFF_TUN; - else - flags |= IFF_TAP; - - if (tun->flags & TUN_NO_PI) - flags |= IFF_NO_PI; - - /* This flag has no real effect. We track the value for backwards - * compatibility. - */ - if (tun->flags & TUN_ONE_QUEUE) - flags |= IFF_ONE_QUEUE; - - if (tun->flags & TUN_VNET_HDR) - flags |= IFF_VNET_HDR; - - if (tun->flags & TUN_TAP_MQ) - flags |= IFF_MULTI_QUEUE; - - if (tun->flags & TUN_PERSIST) - flags |= IFF_PERSIST; - - return flags; + return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr, @@ -1582,6 +1721,17 @@ static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL); static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL); static DEVICE_ATTR(group, 0444, tun_show_group, NULL); +static struct attribute *tun_dev_attrs[] = { + &dev_attr_tun_flags.attr, + &dev_attr_owner.attr, + &dev_attr_group.attr, + NULL +}; + +static const struct attribute_group tun_attr_group = { + .attrs = tun_dev_attrs +}; + static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; @@ -1604,7 +1754,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != - !!(tun->flags & TUN_TAP_MQ)) + !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) @@ -1617,7 +1767,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - if (tun->flags & TUN_TAP_MQ && + if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. @@ -1640,11 +1790,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ - flags |= TUN_TUN_DEV; + flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ - flags |= TUN_TAP_DEV; + flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; @@ -1653,14 +1803,20 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, - tun_setup, queues, queues); + NET_NAME_UNKNOWN, tun_setup, queues, + queues); if (!dev) return -ENOMEM; - +#if 0 + err = dev_get_valid_name(net, dev, name); + if (err < 0) + goto err_free_dev; +#endif dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; + dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; @@ -1668,14 +1824,21 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); + tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; + tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); + if (!tun->pcpu_stats) { + err = -ENOMEM; + goto err_free_dev; + } + spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) - goto err_free_dev; + goto err_free_stat; tun_net_init(dev); tun_flow_init(tun); @@ -1683,7 +1846,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->features = dev->hw_features; + dev->features = dev->hw_features | NETIF_F_LLTX; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); @@ -1696,39 +1859,14 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err = register_netdevice(tun->dev); if (err < 0) goto err_detach; - - if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) || - device_create_file(&tun->dev->dev, &dev_attr_owner) || - device_create_file(&tun->dev->dev, &dev_attr_group)) - pr_err("Failed to create tun sysfs files\n"); } netif_carrier_on(tun->dev); tun_debug(KERN_INFO, tun, "tun_set_iff\n"); - if (ifr->ifr_flags & IFF_NO_PI) - tun->flags |= TUN_NO_PI; - else - tun->flags &= ~TUN_NO_PI; - - /* This flag has no real effect. We track the value for backwards - * compatibility. - */ - if (ifr->ifr_flags & IFF_ONE_QUEUE) - tun->flags |= TUN_ONE_QUEUE; - else - tun->flags &= ~TUN_ONE_QUEUE; - - if (ifr->ifr_flags & IFF_VNET_HDR) - tun->flags |= TUN_VNET_HDR; - else - tun->flags &= ~TUN_VNET_HDR; - - if (ifr->ifr_flags & IFF_MULTI_QUEUE) - tun->flags |= TUN_TAP_MQ; - else - tun->flags &= ~TUN_TAP_MQ; + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); /* Make sure persistent devices do not get stuck in * xoff state. @@ -1744,6 +1882,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err_free_flow: tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); +err_free_stat: + free_percpu(tun->pcpu_stats); err_free_dev: free_netdev(dev); return err; @@ -1806,7 +1946,9 @@ static void tun_detach_filter(struct tun_struct *tun, int n) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - __sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held()); + lock_sock(tfile->socket.sk); + sk_detach_filter(tfile->socket.sk); + release_sock(tfile->socket.sk); } tun->filter_attached = false; @@ -1819,8 +1961,9 @@ static int tun_attach_filter(struct tun_struct *tun) for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk, - lockdep_rtnl_is_held()); + lock_sock(tfile->socket.sk); + ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); + release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; @@ -1862,7 +2005,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = tun_attach(tun, file, false); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); - if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached) + if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); @@ -1886,6 +2029,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int sndbuf; int vnet_hdr_sz; unsigned int ifindex; + int le; int ret; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) { @@ -1897,9 +2041,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on - * TUNSETIFF. */ - return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | - IFF_VNET_HDR | IFF_MULTI_QUEUE, + * TUNSETIFF. + */ + return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) return tun_set_queue(file, &ifr); @@ -1911,7 +2055,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (cmd == TUNSETIFF && !tun) { ifr.ifr_name[IFNAMSIZ-1] = '\0'; - ret = tun_set_iff(tfile->net, file, &ifr); + ret = tun_set_iff(sock_net(&tfile->sk), file, &ifr); if (ret) goto unlock; @@ -1966,12 +2110,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ - if (arg && !(tun->flags & TUN_PERSIST)) { - tun->flags |= TUN_PERSIST; + if (arg && !(tun->flags & IFF_PERSIST)) { + tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); } - if (!arg && (tun->flags & TUN_PERSIST)) { - tun->flags &= ~TUN_PERSIST; + if (!arg && (tun->flags & IFF_PERSIST)) { + tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); } @@ -2029,7 +2173,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; @@ -2061,6 +2205,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; break; } + if (sndbuf <= 0) { + ret = -EINVAL; + break; + } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); @@ -2085,10 +2233,35 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun->vnet_hdr_sz = vnet_hdr_sz; break; + case TUNGETVNETLE: + le = !!(tun->flags & TUN_VNET_LE); + if (put_user(le, (int __user *)argp)) + ret = -EFAULT; + break; + + case TUNSETVNETLE: + if (get_user(le, (int __user *)argp)) { + ret = -EFAULT; + break; + } + if (le) + tun->flags |= TUN_VNET_LE; + else + tun->flags &= ~TUN_VNET_LE; + break; + + case TUNGETVNETBE: + ret = tun_get_vnet_be(tun, argp); + break; + + case TUNSETVNETBE: + ret = tun_set_vnet_be(tun, argp); + break; + case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) @@ -2100,7 +2273,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); @@ -2108,7 +2281,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNGETFILTER: ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) @@ -2172,9 +2345,7 @@ static int tun_chr_fasync(int fd, struct file *file, int on) goto out; if (on) { - ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0); - if (ret) - goto out; + __f_setown(file, task_pid(current), PIDTYPE_PID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; @@ -2185,16 +2356,16 @@ static int tun_chr_fasync(int fd, struct file *file, int on) static int tun_chr_open(struct inode *inode, struct file * file) { + struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; DBG1(KERN_INFO, "tunX: tun_chr_open\n"); - tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, - &tun_proto); + tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, + &tun_proto, 0); if (!tfile) return -ENOMEM; RCU_INIT_POINTER(tfile->tun, NULL); - tfile->net = get_net(current->nsproxy->net_ns); tfile->flags = 0; tfile->ifindex = 0; @@ -2205,33 +2376,31 @@ static int tun_chr_open(struct inode *inode, struct file * file) tfile->socket.ops = &tun_socket_ops; sock_init_data(&tfile->socket, &tfile->sk); - sk_change_net(&tfile->sk, tfile->net); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; - set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags); INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); + memset(&tfile->tx_array, 0, sizeof(tfile->tx_array)); + return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; - struct net *net = tfile->net; tun_detach(tfile, true); - put_net(net); return 0; } #ifdef CONFIG_PROC_FS -static int tun_chr_show_fdinfo(struct seq_file *m, struct file *f) +static void tun_chr_show_fdinfo(struct seq_file *m, struct file *f) { struct tun_struct *tun; struct ifreq ifr; @@ -2247,17 +2416,15 @@ static int tun_chr_show_fdinfo(struct seq_file *m, struct file *f) if (tun) tun_put(tun); - return seq_printf(m, "iff:\t%s\n", ifr.ifr_name); + seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .llseek = no_llseek, - .read = do_sync_read, - .aio_read = tun_chr_aio_read, - .write = do_sync_write, - .aio_write = tun_chr_aio_write, + .read_iter = tun_chr_read_iter, + .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT @@ -2272,7 +2439,7 @@ static const struct file_operations tun_fops = { }; static struct miscdevice tun_miscdev = { - .minor = (TUN_MINOR1), + .minor = TUN_MINOR1, .name = "bf_tun", .nodename = "net/bf_tun", .fops = &tun_fops, @@ -2303,10 +2470,10 @@ static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info strlcpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { - case TUN_TUN_DEV: - strlcpy(info->bus_info, "bf_tun", sizeof(info->bus_info)); + case IFF_TUN: + strlcpy(info->bus_info, "tun", sizeof(info->bus_info)); break; - case TUN_TAP_DEV: + case IFF_TAP: strlcpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } @@ -2339,6 +2506,56 @@ static const struct ethtool_ops tun_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; +static int tun_queue_resize(struct tun_struct *tun) +{ + struct net_device *dev = tun->dev; + struct tun_file *tfile; + struct skb_array **arrays; + int n = tun->numqueues + tun->numdisabled; + int ret, i; + + arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL); + if (!arrays) + return -ENOMEM; + + for (i = 0; i < tun->numqueues; i++) { + tfile = rtnl_dereference(tun->tfiles[i]); + arrays[i] = &tfile->tx_array; + } + list_for_each_entry(tfile, &tun->disabled, next) + arrays[i++] = &tfile->tx_array; + + ret = skb_array_resize_multiple(arrays, n, + dev->tx_queue_len, GFP_KERNEL); + + kfree(arrays); + return ret; +} + +static int tun_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct tun_struct *tun = netdev_priv(dev); + + if (dev->rtnl_link_ops != &tun_link_ops) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGE_TX_QUEUE_LEN: + if (tun_queue_resize(tun)) + return NOTIFY_BAD; + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block tun_notifier_block __read_mostly = { + .notifier_call = tun_device_event, +}; static int __init tun_init(void) { @@ -2355,9 +2572,11 @@ static int __init tun_init(void) ret = misc_register(&tun_miscdev); if (ret) { - pr_err("Can't register misc device %d\n", (TUN_MINOR1)); + pr_err("Can't register misc device %d\n", TUN_MINOR1); goto err_misc; } + + register_netdevice_notifier(&tun_notifier_block); return 0; err_misc: rtnl_link_unregister(&tun_link_ops); @@ -2369,13 +2588,14 @@ static void tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); + unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ -struct socket *bf_tun_get_socket(struct file *file) +struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) @@ -2385,7 +2605,7 @@ struct socket *bf_tun_get_socket(struct file *file) return ERR_PTR(-EBADFD); return &tfile->socket; } -EXPORT_SYMBOL_GPL(bf_tun_get_socket); +EXPORT_SYMBOL_GPL(tun_get_socket); module_init(tun_init); module_exit(tun_cleanup);