diff -Naur linux-3.10.30.org/drivers/net/imq.c linux-3.10.30/drivers/net/imq.c
--- linux-3.10.30.org/drivers/net/imq.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-3.10.30/drivers/net/imq.c	2014-02-14 20:29:05.379402305 +0100
@@ -0,0 +1,1001 @@
+/*
+ * Pseudo-driver for the intermediate queue device.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Patrick McHardy,
+ *
+ * The first version was written by Martin Devera,
+ *
+ * Credits: Jan Rafaj
+ *  - Update patch to 2.4.21
+ * Sebastian Strollo
+ *  - Fix "Dead-loop on netdevice imq"-issue
+ * Marcel Sebek
+ *  - Update to 2.6.2-rc1
+ *
+ * After some time of inactivity there is a group taking care
+ * of IMQ again: http://www.linuximq.net
+ *
+ *
+ * 2004/06/30 - New version of IMQ patch to kernels <=2.6.7
+ * including the following changes:
+ *
+ * - Correction of ipv6 support "+"s issue (Hasso Tepper)
+ * - Correction of imq_init_devs() issue that resulted in
+ *   kernel OOPS unloading IMQ as module (Norbert Buchmuller)
+ * - Addition of functionality to choose number of IMQ devices
+ *   during kernel config (Andre Correa)
+ * - Addition of functionality to choose how IMQ hooks on
+ *   PRE and POSTROUTING (after or before NAT) (Andre Correa)
+ * - Cosmetic corrections (Norbert Buchmuller) (Andre Correa)
+ *
+ *
+ * 2005/12/16 - IMQ versions between 2.6.7 and 2.6.13 were
+ * released with almost no problems. 2.6.14-x was released
+ * with some important changes: nfcache was removed; after
+ * some weeks of trouble we figured out that some IMQ fields
+ * in skb were missing in skbuff.c - skb_clone and copy_skb_header.
+ * These functions are correctly patched by this new patch version.
+ *
+ * Thanks to all who helped to figure out all the problems with
+ * 2.6.14.x: Patrick McHardy, Rune Kock, VeNoMouS, Max CtRiX,
+ * Kevin Shanahan, Richard Lucassen, Valery Dachev (hopefully
+ * I didn't forget anybody). I apologize again for my lack of time.
+ *
+ *
+ * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead
+ * of qdisc_restart() and moved qdisc_run() to tasklet to avoid
+ * recursive locking. New initialization routines to fix 'rmmod' not
+ * working anymore. Used code from ifb.c. (Jussi Kivilinna)
+ *
+ * 2008/08/06 - 2.6.26 - (JK)
+ *  - Replaced tasklet with 'netif_schedule()'.
+ *  - Cleaned up and added comments for imq_nf_queue().
+ *
+ * 2009/04/12
+ *  - Add skb_save_cb/skb_restore_cb helper functions for backing up
+ *    the control buffer. This is needed because the qdisc layer on
+ *    kernels 2.6.27 and newer overwrites the control buffer.
+ *    (Jussi Kivilinna)
+ *  - Add better locking for IMQ device. Hopefully this will solve
+ *    SMP issues. (Jussi Kivilinna)
+ *  - Port to 2.6.27
+ *  - Port to 2.6.28
+ *  - Port to 2.6.29 + fix rmmod not working
+ *
+ * 2009/04/20 - (Jussi Kivilinna)
+ *  - Use netdevice feature flags to avoid extra packet handling
+ *    by core networking layer and possibly increase performance.
+ *
+ * 2009/09/26 - (Jussi Kivilinna)
+ *  - Add imq_nf_reinject_lockless to fix deadlock with
+ *    imq_nf_queue/imq_nf_reinject.
+ *
+ * 2009/12/08 - (Jussi Kivilinna)
+ *  - Port to 2.6.32
+ *  - Add check for skb->nf_queue_entry==NULL in imq_dev_xmit()
+ *  - Also add better error checking for skb->nf_queue_entry usage
+ *
+ * 2010/02/25 - (Jussi Kivilinna)
+ *  - Port to 2.6.33
+ *
+ * 2010/08/15 - (Jussi Kivilinna)
+ *  - Port to 2.6.35
+ *  - Simplify hook registration by using nf_register_hooks.
+ *  - nf_reinject doesn't need a spinlock around it, therefore remove
+ *    the imq_nf_reinject function. Other nf_reinject users protect
+ *    their own data with spinlocks. With IMQ, however, all the data
+ *    needed is stored per skbuff, so no locking is needed.
+ *  - Changed IMQ to use a separate NF_IMQ_QUEUE instead of
+ *    NF_QUEUE; this allows working coexistence of IMQ and other
+ *    NF_QUEUE users.
+ *  - Make IMQ multi-queue. The number of IMQ device queues can be
+ *    increased with the 'numqueues' module parameter. The default
+ *    number of queues is 1; in other words, by default IMQ works as
+ *    a single-queue device. Multi-queue selection is based on the
+ *    IFB multi-queue patch by Changli Gao.
+ *
+ * 2011/03/18 - (Jussi Kivilinna)
+ *  - Port to 2.6.38
+ *
+ * 2011/07/12 - (syoder89@gmail.com)
+ *  - Crash fix that happens when the receiving interface has more
+ *    than one queue (add missing skb_set_queue_mapping in
+ *    imq_select_queue).
+ *
+ * 2011/07/26 - (Jussi Kivilinna)
+ *  - Add queue mapping checks for packets exiting IMQ.
+ *  - Port to 3.0
+ *
+ * 2011/08/16 - (Jussi Kivilinna)
+ *  - Clear IFF_TX_SKB_SHARING flag that was added for linux 3.0.2
+ *
+ * 2011/11/03 - Germano Michel
+ *  - Fix IMQ for net namespaces
+ *
+ * 2011/11/04 - Jussi Kivilinna
+ *  - Port to 3.1
+ *  - Clean-up, move 'get imq device pointer by imqX name' to
+ *    separate function from imq_nf_queue().
+ *
+ * 2012/01/05 - Jussi Kivilinna
+ *  - Port to 3.2
+ *
+ * 2012/03/19 - Jussi Kivilinna
+ *  - Port to 3.3
+ *
+ * 2012/12/12 - Jussi Kivilinna
+ *  - Port to 3.7
+ *  - Fix checkpatch.pl warnings
+ *
+ * 2013/09/10 - Jussi Kivilinna
+ *  - Fixed GSO handling for 3.10, see imq_nf_queue() for comments.
+ *  - Don't copy skb->cb_next when copying or cloning skbuffs.
+ *
+ * Also, many thanks to Pablo Sebastian Greco for making the initial
+ * patch and to those who helped with the testing.
+ *
+ * More info at: http://www.linuximq.net/ (Andre Correa)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	#include <linux/netfilter_ipv6.h>
+#endif
+#include <linux/imq.h>
+#include <net/pkt_sched.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/sch_generic.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/random.h>
+
+static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num);
+
+static nf_hookfn imq_nf_hook;
+
+static struct nf_hook_ops imq_ops[] = {
+	{
+	/* imq_ingress_ipv4 */
+		.hook		= imq_nf_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_PRE_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
+		.priority	= NF_IP_PRI_MANGLE + 1,
+#else
+		.priority	= NF_IP_PRI_NAT_DST + 1,
+#endif
+	},
+	{
+	/* imq_egress_ipv4 */
+		.hook		= imq_nf_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_POST_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
+		.priority	= NF_IP_PRI_LAST,
+#else
+		.priority	= NF_IP_PRI_NAT_SRC - 1,
+#endif
+	},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	{
+	/* imq_ingress_ipv6 */
+		.hook		= imq_nf_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
+		.priority	= NF_IP6_PRI_MANGLE + 1,
+#else
+		.priority	= NF_IP6_PRI_NAT_DST + 1,
+#endif
+	},
+	{
+	/* imq_egress_ipv6 */
+		.hook		= imq_nf_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_POST_ROUTING,
+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
+		.priority	= NF_IP6_PRI_LAST,
+#else
+		.priority	= NF_IP6_PRI_NAT_SRC - 1,
+#endif
+	},
+#endif
+};
+
+#if defined(CONFIG_IMQ_NUM_DEVS)
+static int numdevs = CONFIG_IMQ_NUM_DEVS;
+#else
+static int numdevs = IMQ_MAX_DEVS;
+#endif
+
+static struct net_device *imq_devs_cache[IMQ_MAX_DEVS];
+
+#define IMQ_MAX_QUEUES 32
+static int numqueues = 1;
+static u32 imq_hashrnd;
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+			sizeof(struct pppoe_hdr)));
+}
+
+static u16 imq_hash(struct net_device *dev, struct sk_buff *skb)
+{
+	unsigned int pull_len;
+	u16 protocol = skb->protocol;
+	u32 addr1, addr2;
+	u32 hash, ihl = 0;
+	union {
+		u16 in16[2];
+		u32 in32;
+	} ports;
+	u8 ip_proto;
+
+	pull_len = 0;
+
+recheck:
+	switch (protocol) {
+	case htons(ETH_P_8021Q): {
+		if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
+			goto other;
+
+		pull_len += VLAN_HLEN;
+		skb->network_header += VLAN_HLEN;
+
+		protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+		goto recheck;
+	}
+
+	case htons(ETH_P_PPP_SES): {
+		if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
+			goto other;
+
+		pull_len += PPPOE_SES_HLEN;
+		skb->network_header += PPPOE_SES_HLEN;
+
+		protocol = pppoe_proto(skb);
+		goto recheck;
+	}
+
+	case htons(ETH_P_IP): {
+		const struct iphdr *iph;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+			goto other;
+
+		/* fetch the header pointer only after pskb_may_pull(),
+		 * which may reallocate the header */
+		iph = ip_hdr(skb);
+		addr1 = iph->daddr;
+		addr2 = iph->saddr;
+
+		ip_proto = !(iph->frag_off & htons(IP_MF | IP_OFFSET)) ?
+				 iph->protocol : 0;
+		ihl = ip_hdrlen(skb);
+
+		break;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case htons(ETH_P_IPV6): {
+		const struct ipv6hdr *iph;
+		__be16 fo = 0;
+		int exthdr_off;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
+			goto other;
+
+		iph = ipv6_hdr(skb);
+		addr1 = iph->daddr.s6_addr32[3];
+		addr2 = iph->saddr.s6_addr32[3];
+		/* ipv6_skip_exthdr() signals errors with a negative value,
+		 * which the unsigned 'ihl' could never test for; use a
+		 * signed temporary for the error check. */
+		exthdr_off = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+					      &ip_proto, &fo);
+		if (unlikely(exthdr_off < 0))
+			goto other;
+		ihl = exthdr_off;
+
+		break;
+	}
+#endif
+	default:
+other:
+		if (pull_len != 0) {
+			skb_push(skb, pull_len);
+			skb->network_header -= pull_len;
+		}
+
+		return (u16)(ntohs(protocol) % dev->real_num_tx_queues);
+	}
+
+	if (addr1 > addr2)
+		swap(addr1, addr2);
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE: {
+		if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) {
+			if (ports.in16[0] > ports.in16[1])
+				swap(ports.in16[0], ports.in16[1]);
+			break;
+		}
+		/* fall-through */
+	}
+	default:
+		ports.in32 = 0;
+		break;
+	}
+
+	if (pull_len != 0) {
+		skb_push(skb, pull_len);
+		skb->network_header -= pull_len;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto);
+
+	return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
+}
+
+static inline bool sk_tx_queue_recorded(struct sock *sk)
+{
+	return (sk_tx_queue_get(sk) >= 0);
+}
+
+static struct netdev_queue *imq_select_queue(struct net_device *dev,
+					     struct sk_buff *skb)
+{
+	u16 queue_index = 0;
+	u32 hash;
+
+	if (likely(dev->real_num_tx_queues == 1))
+		goto out;
+
+	/* IMQ can be receiving ingress or egress packets. */
+
+	/* Check first if rx_queue is set */
+	if (skb_rx_queue_recorded(skb)) {
+		queue_index = skb_get_rx_queue(skb);
+		goto out;
+	}
+
+	/* Check if socket has tx_queue set */
+	if (sk_tx_queue_recorded(skb->sk)) {
+		queue_index = sk_tx_queue_get(skb->sk);
+		goto out;
+	}
+
+	/* Try to use socket hash */
+	if (skb->sk && skb->sk->sk_hash) {
+		hash = skb->sk->sk_hash;
+		queue_index =
+			(u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
+		goto out;
+	}
+
+	/* Generate hash from packet data */
+	queue_index = imq_hash(dev, skb);
+
+out:
+	if (unlikely(queue_index >= dev->real_num_tx_queues))
+		queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues);
+
+	skb_set_queue_mapping(skb, queue_index);
+	return netdev_get_tx_queue(dev, queue_index);
+}
+
+static struct net_device_stats *imq_get_stats(struct net_device *dev)
+{
+	return &dev->stats;
+}
+
+/* called for packets kfree'd in qdiscs at places other than enqueue */
+static void imq_skb_destructor(struct sk_buff *skb)
+{
+	struct nf_queue_entry *entry = skb->nf_queue_entry;
+
+	skb->nf_queue_entry = NULL;
+
+	if (entry) {
+		nf_queue_entry_release_refs(entry);
+		kfree(entry);
+	}
+
+	skb_restore_cb(skb); /* kfree backup */
+}
+
+static void imq_done_check_queue_mapping(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	unsigned int queue_index;
+
+	/* Don't let queue_mapping be left too large after exiting IMQ */
+	if (likely(skb->dev != dev && skb->dev != NULL)) {
+		queue_index = skb_get_queue_mapping(skb);
+		if (unlikely(queue_index >= skb->dev->real_num_tx_queues)) {
+			queue_index = (u16)((u32)queue_index %
+						skb->dev->real_num_tx_queues);
+			skb_set_queue_mapping(skb, queue_index);
+		}
+	} else {
+		/* skb->dev was the IMQ device itself or NULL; be on the safe
+		 * side and just clear the queue mapping.
+		 */
+		skb_set_queue_mapping(skb, 0);
+	}
+}
+
+static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct nf_queue_entry *entry = skb->nf_queue_entry;
+
+	skb->nf_queue_entry = NULL;
+	dev->trans_start = jiffies;
+
+	dev->stats.tx_bytes += skb->len;
+	dev->stats.tx_packets++;
+
+	if (unlikely(entry == NULL)) {
+		/* We don't know what is going on here: the packet is queued
+		 * for the imq device, but (probably) not by us.
+		 *
+		 * If this packet was not sent here by imq_nf_queue(), then
+		 * skb_save_cb() was not used and kfree_skb() should not show:
+		 *   WARNING: IMQ: kfree_skb: skb->cb_next:..
+		 * and/or
+		 *   WARNING: IMQ: kfree_skb: skb->nf_queue_entry...
+		 *
+		 * However if this message is shown, then IMQ is somehow broken
+		 * and you should report this to linuximq.net.
+		 */
+
+		/* imq_dev_xmit() is a black hole that eats all packets; report
+		 * that we ate this packet happily and increase the dropped
+		 * counters.
+		 */
+
+		dev->stats.tx_dropped++;
+		dev_kfree_skb(skb);
+
+		return NETDEV_TX_OK;
+	}
+
+	skb_restore_cb(skb); /* restore skb->cb */
+
+	skb->imq_flags = 0;
+	skb->destructor = NULL;
+
+	imq_done_check_queue_mapping(skb, dev);
+
+	nf_reinject(entry, NF_ACCEPT);
+
+	return NETDEV_TX_OK;
+}
+
+static struct net_device *get_imq_device_by_index(int index)
+{
+	struct net_device *dev = NULL;
+	struct net *net;
+	char buf[8];
+
+	/* get device by name and cache result */
+	snprintf(buf, sizeof(buf), "imq%d", index);
+
+	/* Search device from all namespaces. */
+	for_each_net(net) {
+		dev = dev_get_by_name(net, buf);
+		if (dev)
+			break;
+	}
+
+	if (WARN_ON_ONCE(dev == NULL)) {
+		/* IMQ device not found. Exotic config? */
+		return ERR_PTR(-ENODEV);
+	}
+
+	imq_devs_cache[index] = dev;
+	dev_put(dev);
+
+	return dev;
+}
+
+static struct nf_queue_entry *nf_queue_entry_dup(struct nf_queue_entry *e)
+{
+	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
+	if (entry) {
+		if (nf_queue_entry_get_refs(entry))
+			return entry;
+		kfree(entry);
+	}
+	return NULL;
+}
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+/* When called from bridge netfilter, skb->data must point to the MAC header
+ * before calling skb_gso_segment(). Otherwise the original MAC header is lost
+ * and the segmented skbs will be sent to the wrong destination.
+ */
+static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
+{
+	if (skb->nf_bridge)
+		__skb_push(skb, skb->network_header - skb->mac_header);
+}
+
+static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
+{
+	if (skb->nf_bridge)
+		__skb_pull(skb, skb->network_header - skb->mac_header);
+}
+#else
+#define nf_bridge_adjust_skb_data(s) do {} while (0)
+#define nf_bridge_adjust_segmented_data(s) do {} while (0)
+#endif
+
+static void free_entry(struct nf_queue_entry *entry)
+{
+	nf_queue_entry_release_refs(entry);
+	kfree(entry);
+}
+
+static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev);
+
+static int __imq_nf_queue_gso(struct nf_queue_entry *entry,
+			      struct net_device *dev, struct sk_buff *skb)
+{
+	int ret = -ENOMEM;
+	struct nf_queue_entry *entry_seg;
+
+	nf_bridge_adjust_segmented_data(skb);
+
+	if (skb->next == NULL) { /* last packet, no need to copy entry */
+		struct sk_buff *gso_skb = entry->skb;
+		entry->skb = skb;
+		ret = __imq_nf_queue(entry, dev);
+		if (ret)
+			entry->skb = gso_skb;
+		return ret;
+	}
+
+	skb->next = NULL;
+
+	entry_seg = nf_queue_entry_dup(entry);
+	if (entry_seg) {
+		entry_seg->skb = skb;
+		ret = __imq_nf_queue(entry_seg, dev);
+		if (ret)
+			free_entry(entry_seg);
+	}
+	return ret;
+}
+
+static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num)
+{
+	struct sk_buff *skb, *segs;
+	struct net_device *dev;
+	unsigned int queued;
+	int index, retval, err;
+
+	index = entry->skb->imq_flags & IMQ_F_IFMASK;
+	if (unlikely(index > numdevs - 1)) {
+		if (net_ratelimit())
+			pr_warn("IMQ: invalid device specified, highest is %d\n",
+				numdevs - 1);
+		retval = -EINVAL;
+		goto out_no_dev;
+	}
+
+	/* check for imq device by index from cache */
+	dev = imq_devs_cache[index];
+	if (unlikely(!dev)) {
+		dev = get_imq_device_by_index(index);
+		if (IS_ERR(dev)) {
+			retval = PTR_ERR(dev);
+			goto out_no_dev;
+		}
+	}
+
+	if (unlikely(!(dev->flags & IFF_UP))) {
+		entry->skb->imq_flags = 0;
+		retval = -ECANCELED;
+		goto out_no_dev;
+	}
+
+	if (!skb_is_gso(entry->skb))
+		return __imq_nf_queue(entry, dev);
+
+	/* Since 3.10.x, GSO handling moved here as a result of upstream commit
+	 * a5fedd43d5f6c94c71053a66e4c3d2e35f1731a2 (netfilter: move
+	 * skb_gso_segment into nfnetlink_queue module).
+	 *
+	 * The following code replicates the GSO handling from
+	 * 'net/netfilter/nfnetlink_queue_core.c':nfqnl_enqueue_packet().
+	 */
+
+	skb = entry->skb;
+
+	switch (entry->pf) {
+	case NFPROTO_IPV4:
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	case NFPROTO_IPV6:
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	}
+
+	nf_bridge_adjust_skb_data(skb);
+	segs = skb_gso_segment(skb, 0);
+	/* Does not use PTR_ERR to limit the number of error codes that can be
+	 * returned by nf_queue. For instance, callers rely on -ECANCELED to
+	 * mean 'ignore this hook'.
+	 */
+	err = -ENOBUFS;
+	if (IS_ERR(segs))
+		goto out_err;
+	queued = 0;
+	err = 0;
+	do {
+		struct sk_buff *nskb = segs->next;
+		if (nskb && nskb->next)
+			nskb->cb_next = NULL;
+		if (err == 0)
+			err = __imq_nf_queue_gso(entry, dev, segs);
+		if (err == 0)
+			queued++;
+		else
+			kfree_skb(segs);
+		segs = nskb;
+	} while (segs);
+
+	if (queued) {
+		if (err) /* some segments are already queued */
+			free_entry(entry);
+		kfree_skb(skb);
+		return 0;
+	}
+
+out_err:
+	nf_bridge_adjust_segmented_data(skb);
+	retval = err;
+out_no_dev:
+	return retval;
+}
+
+static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev)
+{
+	struct sk_buff *skb_orig, *skb, *skb_shared;
+	struct Qdisc *q;
+	struct netdev_queue *txq;
+	spinlock_t *root_lock;
+	int users;
+	int retval = -EINVAL;
+	unsigned int orig_queue_index;
+
+	dev->last_rx = jiffies;
+
+	skb = entry->skb;
+	skb_orig = NULL;
+
+	/* skb has owner? => make clone */
+	if (unlikely(skb->destructor)) {
+		skb_orig = skb;
+		skb = skb_clone(skb, GFP_ATOMIC);
+		if (unlikely(!skb)) {
+			retval = -ENOMEM;
+			goto out;
+		}
+		skb->cb_next = NULL;
+		entry->skb = skb;
+	}
+
+	skb->nf_queue_entry = entry;
+
+	dev->stats.rx_bytes += skb->len;
+	dev->stats.rx_packets++;
+
+	if (!skb->dev) {
+		/* skb->dev == NULL causes problems; try to find the cause. */
+		if (net_ratelimit()) {
+			dev_warn(&dev->dev,
+				 "received packet with skb->dev == NULL\n");
+			dump_stack();
+		}
+
+		skb->dev = dev;
+	}
+
+	/* Disables softirqs for lock below */
+	rcu_read_lock_bh();
+
+	/* Multi-queue selection */
+	orig_queue_index = skb_get_queue_mapping(skb);
+	txq = imq_select_queue(dev, skb);
+
+	q = rcu_dereference(txq->qdisc);
+	if (unlikely(!q->enqueue))
+		goto packet_not_eaten_by_imq_dev;
+
+	root_lock = qdisc_lock(q);
+	spin_lock(root_lock);
+
+	users = atomic_read(&skb->users);
+
+	skb_shared = skb_get(skb); /* increase reference count by one */
+
+	/* backup skb->cb, as qdisc layer will overwrite it */
+	skb_save_cb(skb_shared);
+	qdisc_enqueue_root(skb_shared, q); /* might kfree_skb */
+
+	if (likely(atomic_read(&skb_shared->users) == users + 1)) {
+		kfree_skb(skb_shared); /* decrease reference count by one */
+
+		skb->destructor = &imq_skb_destructor;
+
+		/* cloned? */
+		if (unlikely(skb_orig))
+			kfree_skb(skb_orig); /* free original */
+
+		spin_unlock(root_lock);
+		rcu_read_unlock_bh();
+
+		/* schedule qdisc dequeue */
+		__netif_schedule(q);
+
+		retval = 0;
+		goto out;
+	} else {
+		skb_restore_cb(skb_shared); /* restore skb->cb */
+		skb->nf_queue_entry = NULL;
+		/*
+		 * The qdisc dropped the packet and already decreased the skb's
+		 * reference count, so we must not free it again; doing so
+		 * would actually destroy the skb.
+		 */
+		spin_unlock(root_lock);
+		goto packet_not_eaten_by_imq_dev;
+	}
+
+packet_not_eaten_by_imq_dev:
+	skb_set_queue_mapping(skb, orig_queue_index);
+	rcu_read_unlock_bh();
+
+	/* cloned? restore original */
+	if (unlikely(skb_orig)) {
+		kfree_skb(skb);
+		entry->skb = skb_orig;
+	}
+	retval = -1;
+out:
+	return retval;
+}
+
+static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb,
+				const struct net_device *indev,
+				const struct net_device *outdev,
+				int (*okfn)(struct sk_buff *))
+{
+	return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT;
+}
+
+static int imq_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+	return 0;
+}
+
+static int imq_open(struct net_device *dev)
+{
+	netif_start_queue(dev);
+	return 0;
+}
+
+static const struct net_device_ops imq_netdev_ops = {
+	.ndo_open		= imq_open,
+	.ndo_stop		= imq_close,
+	.ndo_start_xmit		= imq_dev_xmit,
+	.ndo_get_stats		= imq_get_stats,
+};
+
+static void imq_setup(struct net_device *dev)
+{
+	dev->netdev_ops		= &imq_netdev_ops;
+	dev->type		= ARPHRD_VOID;
+	dev->mtu		= 16000; /* too small? */
+	dev->tx_queue_len	= 11000; /* too big? */
+	dev->flags		= IFF_NOARP;
+	dev->features		= NETIF_F_SG | NETIF_F_FRAGLIST |
+				  NETIF_F_GSO | NETIF_F_HW_CSUM |
+				  NETIF_F_HIGHDMA;
+	dev->priv_flags		&= ~(IFF_XMIT_DST_RELEASE |
+				     IFF_TX_SKB_SHARING);
+}
+
+static int imq_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	int ret = 0;
+
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			ret = -EINVAL;
+			goto end;
+		}
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			ret = -EADDRNOTAVAIL;
+			goto end;
+		}
+	}
+	return 0;
+end:
+	pr_warn("IMQ: imq_validate failed (%d)\n", ret);
+	return ret;
+}
+
+static struct rtnl_link_ops imq_link_ops __read_mostly = {
+	.kind		= "imq",
+	.priv_size	= 0,
+	.setup		= imq_setup,
+	.validate	= imq_validate,
+};
+
+static const struct nf_queue_handler imq_nfqh = {
+	.outfn = imq_nf_queue,
+};
+
+static int __init imq_init_hooks(void)
+{
+	int ret;
+
+	nf_register_queue_imq_handler(&imq_nfqh);
+
+	ret = nf_register_hooks(imq_ops, ARRAY_SIZE(imq_ops));
+	if (ret < 0)
+		nf_unregister_queue_imq_handler();
+
+	return ret;
+}
+
+static int __init imq_init_one(int index)
+{
+	struct net_device *dev;
+	int ret;
+
+	dev = alloc_netdev_mq(0, "imq%d", imq_setup, numqueues);
+	if (!dev)
+		return -ENOMEM;
+
+	ret = dev_alloc_name(dev, dev->name);
+	if (ret < 0)
+		goto fail;
+
+	dev->rtnl_link_ops = &imq_link_ops;
+	ret = register_netdevice(dev);
+	if (ret < 0)
+		goto fail;
+
+	return 0;
+fail:
+	free_netdev(dev);
+	return ret;
+}
+
+static int __init imq_init_devs(void)
+{
+	int err, i;
+
+	if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) {
+		pr_err("IMQ: numdevs has to be between 1 and %u\n",
+		       IMQ_MAX_DEVS);
+		return -EINVAL;
+	}
+
+	if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) {
+		pr_err("IMQ: numqueues has to be between 1 and %u\n",
+		       IMQ_MAX_QUEUES);
+		return -EINVAL;
+	}
+
+	get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd));
+
+	rtnl_lock();
+	err = __rtnl_link_register(&imq_link_ops);
+
+	for (i = 0; i < numdevs && !err; i++)
+		err = imq_init_one(i);
+
+	if (err) {
+		__rtnl_link_unregister(&imq_link_ops);
+		memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
+	}
+	rtnl_unlock();
+
+	return err;
+}
+
+static int __init imq_init_module(void)
+{
+	int err;
+
+#if defined(CONFIG_IMQ_NUM_DEVS)
+	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16);
+	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2);
+	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK);
+#endif
+
+	err = imq_init_devs();
+	if (err) {
+		pr_err("IMQ: Error trying imq_init_devs(net)\n");
+		return err;
+	}
+
+	err = imq_init_hooks();
+	if (err) {
+		pr_err("IMQ: Error trying imq_init_hooks()\n");
+		rtnl_link_unregister(&imq_link_ops);
+		memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
+		return err;
+	}
+
+	pr_info("IMQ driver loaded successfully. (numdevs = %d, numqueues = %d)\n",
+		numdevs, numqueues);
+
+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
+	pr_info("\tHooking IMQ before NAT on PREROUTING.\n");
+#else
+	pr_info("\tHooking IMQ after NAT on PREROUTING.\n");
+#endif
+#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB)
+	pr_info("\tHooking IMQ before NAT on POSTROUTING.\n");
+#else
+	pr_info("\tHooking IMQ after NAT on POSTROUTING.\n");
+#endif
+
+	return 0;
+}
+
+static void __exit imq_unhook(void)
+{
+	nf_unregister_hooks(imq_ops, ARRAY_SIZE(imq_ops));
+	nf_unregister_queue_imq_handler();
+}
+
+static void __exit imq_cleanup_devs(void)
+{
+	rtnl_link_unregister(&imq_link_ops);
+	memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
+}
+
+static void __exit imq_exit_module(void)
+{
+	imq_unhook();
+	imq_cleanup_devs();
+	pr_info("IMQ driver unloaded successfully.\n");
+}
+
+module_init(imq_init_module);
+module_exit(imq_exit_module);
+
+module_param(numdevs, int, 0);
+module_param(numqueues, int, 0);
+MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will be created)");
+MODULE_PARM_DESC(numqueues, "number of queues per IMQ device");
+MODULE_AUTHOR("http://www.linuximq.net");
+MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See http://www.linuximq.net/ for more information.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("imq");
+
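Note on usage: the hooks above only divert packets whose skb->imq_flags carry IMQ_F_ENQUEUE, and setting that flag is the job of the iptables IMQ target module (xt_IMQ, distributed with the userspace patches at linuximq.net, not part of this kernel diff). A minimal sketch of such a target function, assuming the 3.10-era Xtables API and the xt_imq_info structure defined in the header hunks below:

	static unsigned int imq_target(struct sk_buff *skb,
				       const struct xt_action_param *par)
	{
		const struct xt_imq_info *mr = par->targinfo;

		/* Record the target imq device index in the low bits and
		 * arm the PRE/POSTROUTING hooks for this packet. */
		skb->imq_flags = (mr->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE;

		return XT_CONTINUE;
	}

With such a target installed, 'iptables -t mangle -A PREROUTING -i eth0 -j IMQ --todev 0' steers ingress traffic from eth0 through the qdisc attached to imq0.
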
diff -Naur linux-3.10.30.org/drivers/net/Kconfig linux-3.10.30/drivers/net/Kconfig
--- linux-3.10.30.org/drivers/net/Kconfig	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/drivers/net/Kconfig	2014-02-14 20:29:05.379402305 +0100
@@ -207,6 +207,125 @@
 	  depends on RIONET
 	  default "128"
 
+config IMQ
+	tristate "IMQ (intermediate queueing device) support"
+	depends on NETDEVICES && NETFILTER
+	---help---
+	  The IMQ device(s) are used as placeholders for QoS queueing
+	  disciplines. Every packet entering/leaving the IP stack can be
+	  directed through the IMQ device where it is enqueued/dequeued to the
+	  attached qdisc. This allows you to treat network devices as classes
+	  and distribute bandwidth among them. Iptables is used to specify
+	  through which IMQ device, if any, packets travel.
+
+	  More information at: http://www.linuximq.net/
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called imq. If unsure, say N.
+
+choice
+	prompt "IMQ behavior (PRE/POSTROUTING)"
+	depends on IMQ
+	default IMQ_BEHAVIOR_AB
+	help
+	  This setting defines how IMQ behaves with respect to its
+	  hooking in PREROUTING and POSTROUTING.
+
+	  IMQ can work in any of the following ways:
+
+	      PREROUTING   |      POSTROUTING
+	  -----------------|-------------------
+	  #1  After NAT    |      After NAT
+	  #2  After NAT    |      Before NAT
+	  #3  Before NAT   |      After NAT
+	  #4  Before NAT   |      Before NAT
+
+	  The default behavior is to hook after NAT on PREROUTING
+	  and before NAT on POSTROUTING (#2, matching the default
+	  choice IMQ_BEHAVIOR_AB).
+
+	  These settings are especially useful when trying to use IMQ
+	  to shape NATed clients.
+
+	  More information can be found at: www.linuximq.net
+
+	  If not sure, leave the default settings alone.
+
+config IMQ_BEHAVIOR_AA
+	bool "IMQ AA"
+	help
+	  This setting defines how IMQ behaves with respect to its
+	  hooking in PREROUTING and POSTROUTING.
+
+	  Choosing this option will make IMQ hook like this:
+
+	  PREROUTING:   After NAT
+	  POSTROUTING:  After NAT
+
+	  More information can be found at: www.linuximq.net
+
+	  If not sure, leave the default settings alone.
+
+config IMQ_BEHAVIOR_AB
+	bool "IMQ AB"
+	help
+	  This setting defines how IMQ behaves with respect to its
+	  hooking in PREROUTING and POSTROUTING.
+
+	  Choosing this option will make IMQ hook like this:
+
+	  PREROUTING:   After NAT
+	  POSTROUTING:  Before NAT
+
+	  More information can be found at: www.linuximq.net
+
+	  If not sure, leave the default settings alone.
+
+config IMQ_BEHAVIOR_BA
+	bool "IMQ BA"
+	help
+	  This setting defines how IMQ behaves with respect to its
+	  hooking in PREROUTING and POSTROUTING.
+
+	  Choosing this option will make IMQ hook like this:
+
+	  PREROUTING:   Before NAT
+	  POSTROUTING:  After NAT
+
+	  More information can be found at: www.linuximq.net
+
+	  If not sure, leave the default settings alone.
+
+config IMQ_BEHAVIOR_BB
+	bool "IMQ BB"
+	help
+	  This setting defines how IMQ behaves with respect to its
+	  hooking in PREROUTING and POSTROUTING.
+
+	  Choosing this option will make IMQ hook like this:
+
+	  PREROUTING:   Before NAT
+	  POSTROUTING:  Before NAT
+
+	  More information can be found at: www.linuximq.net
+
+	  If not sure, leave the default settings alone.
+
+endchoice
+
+config IMQ_NUM_DEVS
+	int "Number of IMQ devices"
+	range 2 16
+	depends on IMQ
+	default "16"
+	help
+	  This setting defines how many IMQ devices will be created.
+
+	  The default value is 16.
+
+	  More information can be found at: www.linuximq.net
+
+	  If not sure, leave the default settings alone.
+
 config TUN
 	tristate "Universal TUN/TAP device driver support"
 	select CRC32
diff -Naur linux-3.10.30.org/drivers/net/Makefile linux-3.10.30/drivers/net/Makefile
--- linux-3.10.30.org/drivers/net/Makefile	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/drivers/net/Makefile	2014-02-14 20:29:05.379402305 +0100
@@ -9,6 +9,7 @@
 obj-$(CONFIG_DUMMY) += dummy.o
 obj-$(CONFIG_EQUALIZER) += eql.o
 obj-$(CONFIG_IFB) += ifb.o
+obj-$(CONFIG_IMQ) += imq.o
 obj-$(CONFIG_MACVLAN) += macvlan.o
 obj-$(CONFIG_MACVTAP) += macvtap.o
 obj-$(CONFIG_MII) += mii.o
diff -Naur linux-3.10.30.org/include/linux/imq.h linux-3.10.30/include/linux/imq.h
--- linux-3.10.30.org/include/linux/imq.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-3.10.30/include/linux/imq.h	2014-02-14 20:29:05.379402305 +0100
@@ -0,0 +1,13 @@
+#ifndef _IMQ_H
+#define _IMQ_H
+
+/* IFMASK (16 device indexes, 0 to 15) and flag(s) fit in 5 bits */
+#define IMQ_F_BITS	5
+
+#define IMQ_F_IFMASK	0x0f
+#define IMQ_F_ENQUEUE	0x10
+
+#define IMQ_MAX_DEVS	(IMQ_F_IFMASK + 1)
+
+#endif /* _IMQ_H */
+
diff -Naur linux-3.10.30.org/include/linux/netfilter/xt_IMQ.h linux-3.10.30/include/linux/netfilter/xt_IMQ.h
--- linux-3.10.30.org/include/linux/netfilter/xt_IMQ.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-3.10.30/include/linux/netfilter/xt_IMQ.h	2014-02-14 20:29:05.379402305 +0100
@@ -0,0 +1,9 @@
+#ifndef _XT_IMQ_H
+#define _XT_IMQ_H
+
+struct xt_imq_info {
+	unsigned int todev; /* target imq device */
+};
+
+#endif /* _XT_IMQ_H */
+
diff -Naur linux-3.10.30.org/include/linux/netfilter_ipv4/ipt_IMQ.h linux-3.10.30/include/linux/netfilter_ipv4/ipt_IMQ.h
--- linux-3.10.30.org/include/linux/netfilter_ipv4/ipt_IMQ.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-3.10.30/include/linux/netfilter_ipv4/ipt_IMQ.h	2014-02-14 20:29:05.379402305 +0100
@@ -0,0 +1,10 @@
+#ifndef _IPT_IMQ_H
+#define _IPT_IMQ_H
+
+/* Backwards compatibility for old userspace */
+#include <linux/netfilter/xt_IMQ.h>
+
+#define ipt_imq_info xt_imq_info
+
+#endif /* _IPT_IMQ_H */
+
diff -Naur linux-3.10.30.org/include/linux/netfilter_ipv6/ip6t_IMQ.h linux-3.10.30/include/linux/netfilter_ipv6/ip6t_IMQ.h
--- linux-3.10.30.org/include/linux/netfilter_ipv6/ip6t_IMQ.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-3.10.30/include/linux/netfilter_ipv6/ip6t_IMQ.h	2014-02-14 20:29:05.379402305 +0100
@@ -0,0 +1,10 @@
+#ifndef _IP6T_IMQ_H
+#define _IP6T_IMQ_H
+
+/* Backwards compatibility for old userspace */
+#include <linux/netfilter/xt_IMQ.h>
+
+#define ip6t_imq_info xt_imq_info
+
+#endif /* _IP6T_IMQ_H */
+
diff -Naur linux-3.10.30.org/include/linux/skbuff.h linux-3.10.30/include/linux/skbuff.h
--- linux-3.10.30.org/include/linux/skbuff.h	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/include/linux/skbuff.h	2014-02-14 20:29:05.379402305 +0100
@@ -33,6 +33,9 @@
 #include <linux/hrtimer.h>
 #include <linux/dma-mapping.h>
 #include <linux/netdev_features.h>
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+#include <linux/imq.h>
+#endif
 
 /* Don't change this without changing skb_csum_unnecessary! */
 #define CHECKSUM_NONE 0
@@ -414,6 +417,9 @@
 	 * first. This is owned by whoever has the skb queued ATM.
 	 */
 	char			cb[48] __aligned(8);
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	void			*cb_next;
+#endif
 
 	unsigned long		_skb_refdst;
 #ifdef CONFIG_XFRM
@@ -449,6 +455,9 @@
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct nf_conntrack	*nfct;
 #endif
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	struct nf_queue_entry	*nf_queue_entry;
+#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
 	struct nf_bridge_info	*nf_bridge;
 #endif
@@ -487,7 +496,9 @@
 	__u8			encapsulation:1;
 	/* 7/9 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
-
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	__u8			imq_flags:IMQ_F_BITS;
+#endif
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
 #endif
@@ -616,7 +627,10 @@
 {
 	return (struct rtable *)skb_dst(skb);
 }
-
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+extern int skb_save_cb(struct sk_buff *skb);
+extern int skb_restore_cb(struct sk_buff *skb);
+#endif
 extern void kfree_skb(struct sk_buff *skb);
 extern void kfree_skb_list(struct sk_buff *segs);
 extern void skb_tx_error(struct sk_buff *skb);
@@ -2735,6 +2749,10 @@
 	nf_conntrack_get(src->nfct);
 	dst->nfctinfo = src->nfctinfo;
 #endif
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	dst->imq_flags = src->imq_flags;
+	dst->nf_queue_entry = src->nf_queue_entry;
+#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
 	dst->nf_bridge = src->nf_bridge;
 	nf_bridge_get(src->nf_bridge);
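The cb_next pointer and the skb_save_cb()/skb_restore_cb() pair declared above (implemented in the net/core/skbuff.c hunk further down) let IMQ stack backups of the 48-byte control buffer, which the qdisc layer is free to clobber on 2.6.27+ kernels. A minimal sketch of the intended pairing, modelled on what imq.c does around qdisc_enqueue_root(); the function name is hypothetical:

	static int example_enqueue_preserving_cb(struct sk_buff *skb,
						 struct Qdisc *q)
	{
		int err;

		err = skb_save_cb(skb);	/* push a backup onto skb->cb_next */
		if (err)
			return err;	/* -ENOMEM, skb->cb left untouched */

		skb_get(skb);		/* the qdisc may free the skb on drop */
		err = qdisc_enqueue_root(skb, q); /* may overwrite skb->cb */
		if (err != NET_XMIT_SUCCESS)
			skb_restore_cb(skb); /* dropped: pop the backup again */

		kfree_skb(skb);		/* release our extra reference */
		return err;
	}

imq.c itself pops the backup later instead: in imq_dev_xmit() on the normal transmit path, and in imq_skb_destructor() when the qdisc frees the packet.
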
diff -Naur linux-3.10.30.org/include/net/netfilter/nf_queue.h linux-3.10.30/include/net/netfilter/nf_queue.h
--- linux-3.10.30.org/include/net/netfilter/nf_queue.h	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/include/net/netfilter/nf_queue.h	2014-02-14 20:29:05.382736249 +0100
@@ -29,6 +29,12 @@
 void nf_register_queue_handler(const struct nf_queue_handler *qh);
 void nf_unregister_queue_handler(void);
 extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);
+extern void nf_queue_entry_release_refs(struct nf_queue_entry *entry);
+
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+extern void nf_register_queue_imq_handler(const struct nf_queue_handler *qh);
+extern void nf_unregister_queue_imq_handler(void);
+#endif
 
 bool nf_queue_entry_get_refs(struct nf_queue_entry *entry);
 void nf_queue_entry_release_refs(struct nf_queue_entry *entry);
diff -Naur linux-3.10.30.org/include/uapi/linux/netfilter.h linux-3.10.30/include/uapi/linux/netfilter.h
--- linux-3.10.30.org/include/uapi/linux/netfilter.h	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/include/uapi/linux/netfilter.h	2014-02-14 20:29:05.382736249 +0100
@@ -13,7 +13,8 @@
 #define NF_QUEUE 3
 #define NF_REPEAT 4
 #define NF_STOP 5
-#define NF_MAX_VERDICT NF_STOP
+#define NF_IMQ_QUEUE 6
+#define NF_MAX_VERDICT NF_IMQ_QUEUE
 
 /* we overload the higher bits for encoding auxiliary data such as the queue
  * number or errno values. Not nice, but better than additional function
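Raising NF_MAX_VERDICT matters because verdict words overload their upper bits with auxiliary data, as the comment in the hunk above notes; the NF_QUEUE_NR() macro in this same header encodes an nfnetlink queue number that way. The new NF_IMQ_QUEUE verdict needs no such payload, since the target device index already travels in skb->imq_flags. For contrast, a worked example of the existing encoding:

	/* Verdict "queue to nfnetlink queue 3": queue number in the high
	 * 16 bits, base verdict NF_QUEUE in the low bits. */
	unsigned int verdict = NF_QUEUE_NR(3);
	/* expands to ((3 << 16) & NF_VERDICT_QMASK) | NF_QUEUE */
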
diff -Naur linux-3.10.30.org/net/core/dev.c linux-3.10.30/net/core/dev.c
--- linux-3.10.30.org/net/core/dev.c	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/core/dev.c	2014-02-14 20:29:05.382736249 +0100
@@ -129,6 +129,9 @@
 #include <linux/static_key.h>
 #include <linux/hashtable.h>
 #include <linux/vmalloc.h>
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+#include <linux/imq.h>
+#endif
 
 #include "net-sysfs.h"
 
@@ -2573,7 +2576,12 @@
 		}
 	}
 
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	if (!list_empty(&ptype_all) &&
+	    !(skb->imq_flags & IMQ_F_ENQUEUE))
+#else
 	if (!list_empty(&ptype_all))
+#endif
 		dev_queue_xmit_nit(skb, dev);
 
 	skb_len = skb->len;
diff -Naur linux-3.10.30.org/net/core/skbuff.c linux-3.10.30/net/core/skbuff.c
--- linux-3.10.30.org/net/core/skbuff.c	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/core/skbuff.c	2014-02-14 21:47:17.286039229 +0100
@@ -73,6 +73,9 @@
 
 struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+static struct kmem_cache *skbuff_cb_store_cache __read_mostly;
+#endif
 
 /**
  *	skb_panic - private function for out-of-line support
@@ -552,6 +555,29 @@
 		WARN_ON(in_irq());
 		skb->destructor(skb);
 	}
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	/*
+	 * This should not happen. When it does, avoid a memleak by restoring
+	 * the chain of cb-backups.
+	 */
+	while (skb->cb_next != NULL) {
+		if (net_ratelimit())
+			pr_warn("IMQ: kfree_skb: skb->cb_next: %p\n",
+				skb->cb_next);
+
+		skb_restore_cb(skb);
+	}
+	/*
+	 * This should not happen either: nf_queue_entry is nullified in
+	 * imq_dev_xmit(). If we have a non-NULL nf_queue_entry then we are
+	 * leaking entry pointers, maybe memory. We don't know if this is a
+	 * pointer to already freed memory, or whether it should be freed
+	 * here. If this happens we would need to add refcounting, etc. for
+	 * nf_queue_entry.
+	 */
+	if (skb->nf_queue_entry && net_ratelimit())
+		pr_warn("%s\n", "IMQ: kfree_skb: skb->nf_queue_entry != NULL");
+#endif
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	nf_conntrack_put(skb->nfct);
 #endif
@@ -683,6 +709,10 @@
 	new->sp			= secpath_get(old->sp);
 #endif
 	memcpy(new->cb, old->cb, sizeof(old->cb));
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	new->cb_next = NULL;
+	/*skb_copy_stored_cb(new, old);*/
+#endif
 	new->csum		= old->csum;
 	new->local_df		= old->local_df;
 	new->pkt_type		= old->pkt_type;
@@ -3050,6 +3080,15 @@
 }
 EXPORT_SYMBOL_GPL(skb_gro_receive);
 
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+/* Control buffer save/restore for IMQ devices */
+struct skb_cb_table {
+	char			cb[48] __aligned(8);
+	void			*cb_next;
+	atomic_t		refcnt;
+};
+#endif
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -3063,6 +3102,13 @@
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 						NULL);
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	skbuff_cb_store_cache = kmem_cache_create("skbuff_cb_store_cache",
+						  sizeof(struct skb_cb_table),
+						  0,
+						  SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+						  NULL);
+#endif
 }
 
 /**
@@ -3348,6 +3394,76 @@
 }
 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
 
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+static DEFINE_SPINLOCK(skb_cb_store_lock);
+
+int skb_save_cb(struct sk_buff *skb)
+{
+	struct skb_cb_table *next;
+
+	next = kmem_cache_alloc(skbuff_cb_store_cache, GFP_ATOMIC);
+	if (!next)
+		return -ENOMEM;
+
+	BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb));
+
+	memcpy(next->cb, skb->cb, sizeof(skb->cb));
+	next->cb_next = skb->cb_next;
+
+	atomic_set(&next->refcnt, 1);
+
+	skb->cb_next = next;
+	return 0;
+}
+EXPORT_SYMBOL(skb_save_cb);
+
+int skb_restore_cb(struct sk_buff *skb)
+{
+	struct skb_cb_table *next;
+
+	if (!skb->cb_next)
+		return 0;
+
+	next = skb->cb_next;
+
+	BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb));
+
+	memcpy(skb->cb, next->cb, sizeof(skb->cb));
+	skb->cb_next = next->cb_next;
+
+	spin_lock(&skb_cb_store_lock);
+
+	if (atomic_dec_and_test(&next->refcnt))
+		kmem_cache_free(skbuff_cb_store_cache, next);
+
+	spin_unlock(&skb_cb_store_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(skb_restore_cb);
+
+static void skb_copy_stored_cb(struct sk_buff *new, const struct sk_buff *__old)
+{
+	struct skb_cb_table *next;
+	struct sk_buff *old;
+
+	if (!__old->cb_next) {
+		new->cb_next = NULL;
+		return;
+	}
+
+	spin_lock(&skb_cb_store_lock);
+
+	old = (struct sk_buff *)__old;
+
+	next = old->cb_next;
+	atomic_inc(&next->refcnt);
+	new->cb_next = next;
+
+	spin_unlock(&skb_cb_store_lock);
+}
+#endif
+
 /**
  * skb_partial_csum_set - set up and verify partial csum values for packet
  * @skb: the skb to set
diff -Naur linux-3.10.30.org/net/ipv6/ip6_output.c linux-3.10.30/net/ipv6/ip6_output.c
--- linux-3.10.30.org/net/ipv6/ip6_output.c	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/ipv6/ip6_output.c	2014-02-14 20:29:05.392738001 +0100
@@ -89,9 +89,6 @@
 	struct in6_addr *nexthop;
 	int ret;
 
-	skb->protocol = htons(ETH_P_IPV6);
-	skb->dev = dev;
-
 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 
@@ -168,6 +165,13 @@
 		return 0;
 	}
 
+	/*
+	 * IMQ-patch: moved setting skb->dev and skb->protocol from
+	 * ip6_finish_output2 to fix crashing at netif_skb_features().
+	 */
+	skb->protocol = htons(ETH_P_IPV6);
+	skb->dev = dev;
+
 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 			    ip6_finish_output,
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ */ + if (newskb) + NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, + dev_loopback_xmit); + + if (ipv6_hdr(skb)->hop_limit == 0) { + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + } + + IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, + skb->len); + + if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= + IPV6_ADDR_SCOPE_NODELOCAL && + !(dev->flags & IFF_LOOPBACK)) { + kfree_skb(skb); + return 0; + } + } + + rcu_read_lock_bh(); + nexthop = rt6_nexthop((struct rt6_info *)dst); + neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); + if (!IS_ERR(neigh)) { + ret = dst_neigh_output(dst, neigh, skb); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); + + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EINVAL; +} + +static int ip6_finish_output(struct sk_buff *skb) +{ + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb)) || + (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) + return ip6_fragment(skb, ip6_finish_output2); + else + return ip6_finish_output2(skb); +} + +int ip6_output(struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + if (unlikely(idev->cnf.disable_ipv6)) { + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); +} + +/* + * xmit an sk_buff (used by TCP, SCTP and DCCP) + */ + +int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, + struct ipv6_txoptions *opt, int tclass) +{ + struct net *net = sock_net(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *first_hop = &fl6->daddr; + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr; + u8 proto = fl6->flowi6_proto; + int seg_len = skb->len; + int hlimit = -1; + u32 mtu; + + if (opt) { + unsigned int head_room; + + /* First: exthdrs may take lots of space (~8K for now) + MAX_HEADER is not enough. 
+ */ + head_room = opt->opt_nflen + opt->opt_flen; + seg_len += head_room; + head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + + if (skb_headroom(skb) < head_room) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (skb2 == NULL) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; + } + consume_skb(skb); + skb = skb2; + skb_set_owner_w(skb, sk); + } + if (opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); + } + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); + + /* + * Fill in the IPv6 header + */ + if (np) + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + + ip6_flow_hdr(hdr, tclass, fl6->flowlabel); + + hdr->payload_len = htons(seg_len); + hdr->nexthdr = proto; + hdr->hop_limit = hlimit; + + hdr->saddr = fl6->saddr; + hdr->daddr = *first_hop; + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + mtu = dst_mtu(dst); + if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { + IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUT, skb->len); + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + dst->dev, dst_output); + } + + skb->dev = dst->dev; + ipv6_local_error(sk, EMSGSIZE, fl6, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; +} + +EXPORT_SYMBOL(ip6_xmit); + +static int ip6_call_ra_chain(struct sk_buff *skb, int sel) +{ + struct ip6_ra_chain *ra; + struct sock *last = NULL; + + read_lock(&ip6_ra_lock); + for (ra = ip6_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && ra->sel == sel && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + rawv6_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + rawv6_rcv(last, skb); + read_unlock(&ip6_ra_lock); + return 1; + } + read_unlock(&ip6_ra_lock); + return 0; +} + +static int ip6_forward_proxy_check(struct sk_buff *skb) +{ + struct ipv6hdr *hdr = ipv6_hdr(skb); + u8 nexthdr = hdr->nexthdr; + __be16 frag_off; + int offset; + + if (ipv6_ext_hdr(nexthdr)) { + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); + if (offset < 0) + return 0; + } else + offset = sizeof(struct ipv6hdr); + + if (nexthdr == IPPROTO_ICMPV6) { + struct icmp6hdr *icmp6; + + if (!pskb_may_pull(skb, (skb_network_header(skb) + + offset + 1 - skb->data))) + return 0; + + icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); + + switch (icmp6->icmp6_type) { + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + /* For reaction involving unicast neighbor discovery + * message destined to the proxied address, pass it to + * input function. + */ + return 1; + default: + break; + } + } + + /* + * The proxying router can't forward traffic sent to a link-local + * address, so signal the sender and discard the packet. This + * behavior is clarified by the MIPv6 specification. 
+ */ + if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { + dst_link_failure(skb); + return -1; + } + + return 0; +} + +static inline int ip6_forward_finish(struct sk_buff *skb) +{ + return dst_output(skb); +} + +int ip6_forward(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(dst->dev); + u32 mtu; + + if (net->ipv6.devconf_all->forwarding == 0) + goto error; + + if (skb_warn_if_lro(skb)) + goto drop; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + skb_forward_csum(skb); + + /* + * We DO NOT make any processing on + * RA packets, pushing them to user level AS IS + * without ane WARRANTY that application will be able + * to interpret them. The reason is that we + * cannot make anything clever here. + * + * We are not end-node, so that if packet contains + * AH/ESP, we cannot make anything. + * Defragmentation also would be mistake, RA packets + * cannot be fragmented, because there is no warranty + * that different fragments will go along one path. --ANK + */ + if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { + if (ip6_call_ra_chain(skb, ntohs(opt->ra))) + return 0; + } + + /* + * check and decrement ttl + */ + if (hdr->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + + kfree_skb(skb); + return -ETIMEDOUT; + } + + /* XXX: idev->cnf.proxy_ndp? */ + if (net->ipv6.devconf_all->proxy_ndp && + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + int proxied = ip6_forward_proxy_check(skb); + if (proxied > 0) + return ip6_input(skb); + else if (proxied < 0) { + IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + + if (!xfrm6_route_forward(skb)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + goto drop; + } + dst = skb_dst(skb); + + /* IPv6 specs say nothing about it, but it is clear that we cannot + send redirects to source routed frames. + We don't send redirects to frames decapsulated from IPsec. + */ + if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { + struct in6_addr *target = NULL; + struct inet_peer *peer; + struct rt6_info *rt; + + /* + * incoming and outgoing devices are the same + * send a redirect. + */ + + rt = (struct rt6_info *) dst; + if (rt->rt6i_flags & RTF_GATEWAY) + target = &rt->rt6i_gateway; + else + target = &hdr->daddr; + + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + + /* Limit redirects both by destination (here) + and by source (inside ndisc_send_redirect) + */ + if (inet_peer_xrlim_allow(peer, 1*HZ)) + ndisc_send_redirect(skb, target); + if (peer) + inet_putpeer(peer); + } else { + int addrtype = ipv6_addr_type(&hdr->saddr); + + /* This check is security critical. 
*/ + if (addrtype == IPV6_ADDR_ANY || + addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) + goto error; + if (addrtype & IPV6_ADDR_LINKLOCAL) { + icmpv6_send(skb, ICMPV6_DEST_UNREACH, + ICMPV6_NOT_NEIGHBOUR, 0); + goto error; + } + } + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) || + (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { + /* Again, force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (skb_cow(skb, dst->dev->hard_header_len)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); + goto drop; + } + + hdr = ipv6_hdr(skb); + + /* Mangling hops number delayed to point after skb COW */ + + hdr->hop_limit--; + + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + ip6_forward_finish); + +error: + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); +drop: + kfree_skb(skb); + return -EINVAL; +} + +static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); + to->dev = from->dev; + to->mark = from->mark; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif + nf_copy(to, from); +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) + to->nf_trace = from->nf_trace; +#endif + skb_copy_secmark(to, from); +} + +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +{ + struct sk_buff *frag; + struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); + struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6hdr *tmp_hdr; + struct frag_hdr *fh; + unsigned int mtu, hlen, left, len; + int hroom, troom; + __be32 frag_id = 0; + int ptr, offset = 0, err=0; + u8 *prevhdr, nexthdr = 0; + struct net *net = dev_net(skb_dst(skb)->dev); + + hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + + mtu = ip6_skb_dst_mtu(skb); + + /* We must not fragment if the socket is set to force MTU discovery + * or if the skb it not generated by a local socket. + */ + if (unlikely(!skb->local_df && skb->len > mtu) || + (IP6CB(skb)->frag_max_size && + IP6CB(skb)->frag_max_size > mtu)) { + if (skb->sk && dst_allfrag(skb_dst(skb))) + sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (np && np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + mtu -= hlen + sizeof(struct frag_hdr); + + if (skb_has_frag_list(skb)) { + int first_len = skb_pagelen(skb); + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path_clean; + + /* Partially cloned skb? 
*/ + if (skb_shared(frag)) + goto slow_path_clean; + + BUG_ON(frag->sk); + if (skb->sk) { + frag->sk = skb->sk; + frag->destructor = sock_wfree; + } + skb->truesize -= frag->truesize; + } + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + /* BUILD HEADER */ + + *prevhdr = NEXTHDR_FRAGMENT; + tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!tmp_hdr) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + return -ENOMEM; + } + + __skb_pull(skb, hlen); + fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), tmp_hdr, hlen); + + ipv6_select_ident(fh, rt); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + frag_id = fh->identification; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - + sizeof(struct ipv6hdr)); + + dst_hold(&rt->dst); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. */ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), tmp_hdr, + hlen); + offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(offset); + if (frag->next != NULL) + fh->frag_off |= htons(IP6_MF); + fh->identification = frag_id; + ipv6_hdr(frag)->payload_len = + htons(frag->len - + sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); + } + + err = output(skb); + if(!err) + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGCREATES); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + kfree(tmp_hdr); + + if (err == 0) { + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGOKS); + ip6_rt_put(rt); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGFAILS); + ip6_rt_put(rt); + return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } + } + +slow_path: + if ((skb->ip_summed == CHECKSUM_PARTIAL) && + skb_checksum_help(skb)) + goto fail; + + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* + * Fragment the datagram. + */ + + *prevhdr = NEXTHDR_FRAGMENT; + hroom = LL_RESERVED_SPACE(rt->dst.dev); + troom = rt->dst.dev->needed_tailroom; + + /* + * Keep copying data until we run out. + */ + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. 
+
+		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
+				      hroom + troom, GFP_ATOMIC)) == NULL) {
+			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
+			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+				      IPSTATS_MIB_FRAGFAILS);
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		/*
+		 *	Set up data on packet
+		 */
+
+		ip6_copy_metadata(frag, skb);
+		skb_reserve(frag, hroom);
+		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
+		skb_reset_network_header(frag);
+		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
+		frag->transport_header = (frag->network_header + hlen +
+					  sizeof(struct frag_hdr));
+
+		/*
+		 *	Charge the memory for the fragment to any owner
+		 *	it might possess
+		 */
+		if (skb->sk)
+			skb_set_owner_w(frag, skb->sk);
+
+		/*
+		 *	Copy the packet header into the new buffer.
+		 */
+		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
+
+		/*
+		 *	Build fragment header.
+		 */
+		fh->nexthdr = nexthdr;
+		fh->reserved = 0;
+		if (!frag_id) {
+			ipv6_select_ident(fh, rt);
+			frag_id = fh->identification;
+		} else
+			fh->identification = frag_id;
+
+		/*
+		 *	Copy a block of the IP datagram.
+		 */
+		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
+			BUG();
+		left -= len;
+
+		fh->frag_off = htons(offset);
+		if (left > 0)
+			fh->frag_off |= htons(IP6_MF);
+		ipv6_hdr(frag)->payload_len = htons(frag->len -
+						    sizeof(struct ipv6hdr));
+
+		ptr += len;
+		offset += len;
+
+		/*
+		 *	Put this fragment into the sending queue.
+		 */
+		err = output(frag);
+		if (err)
+			goto fail;
+
+		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+			      IPSTATS_MIB_FRAGCREATES);
+	}
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+		      IPSTATS_MIB_FRAGOKS);
+	consume_skb(skb);
+	return err;
+
+fail:
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+		      IPSTATS_MIB_FRAGFAILS);
+	kfree_skb(skb);
+	return err;
+}
+
+static inline int ip6_rt_check(const struct rt6key *rt_key,
+			       const struct in6_addr *fl_addr,
+			       const struct in6_addr *addr_cache)
+{
+	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
+		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
+}
+
+static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
+					  struct dst_entry *dst,
+					  const struct flowi6 *fl6)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct rt6_info *rt;
+
+	if (!dst)
+		goto out;
+
+	if (dst->ops->family != AF_INET6) {
+		dst_release(dst);
+		return NULL;
+	}
+
+	rt = (struct rt6_info *)dst;
+	/* Yes, checking route validity in the unconnected
+	 * case is not very simple. Take into account that
+	 * we do not support routing by source, TOS, and
+	 * MSG_DONTROUTE		--ANK (980726)
+	 *
+	 * 1. ip6_rt_check(): If route was host route,
+	 *    check that cached destination is current.
+	 *    If it is network route, we still may
+	 *    check its validity using saved pointer
+	 *    to the last used address: daddr_cache.
+	 *    We do not want to save whole address now,
+	 *    (because main consumer of this service
+	 *    is tcp, which does not have this problem),
+	 *    so that the last trick works only on connected
+	 *    sockets.
+	 * 2. oif also should be the same.
+	 */
+	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
+#ifdef CONFIG_IPV6_SUBTREES
+	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
+#endif
+	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
+		dst_release(dst);
+		dst = NULL;
+	}
+
+out:
+	return dst;
+}
+
+static int ip6_dst_lookup_tail(struct sock *sk,
+			       struct dst_entry **dst, struct flowi6 *fl6)
+{
+	struct net *net = sock_net(sk);
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	struct neighbour *n;
+	struct rt6_info *rt;
+#endif
+	int err;
+
+	if (*dst == NULL)
+		*dst = ip6_route_output(net, sk, fl6);
+
+	if ((err = (*dst)->error))
+		goto out_err_release;
+
+	if (ipv6_addr_any(&fl6->saddr)) {
+		struct rt6_info *rt = (struct rt6_info *) *dst;
+		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+					  sk ? inet6_sk(sk)->srcprefs : 0,
+					  &fl6->saddr);
+		if (err)
+			goto out_err_release;
+	}
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	/*
+	 * Here if the dst entry we've looked up
+	 * has a neighbour entry that is in the INCOMPLETE
+	 * state and the src address from the flow is
+	 * marked as OPTIMISTIC, we release the found
+	 * dst entry and replace it instead with the
+	 * dst entry of the nexthop router
+	 */
+	rt = (struct rt6_info *) *dst;
+	rcu_read_lock_bh();
+	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
+	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
+	rcu_read_unlock_bh();
+
+	if (err) {
+		struct inet6_ifaddr *ifp;
+		struct flowi6 fl_gw6;
+		int redirect;
+
+		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
+				      (*dst)->dev, 1);
+
+		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
+		if (ifp)
+			in6_ifa_put(ifp);
+
+		if (redirect) {
+			/*
+			 * We need to get the dst entry for the
+			 * default router instead
+			 */
+			dst_release(*dst);
+			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
+			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
+			*dst = ip6_route_output(net, sk, &fl_gw6);
+			if ((err = (*dst)->error))
+				goto out_err_release;
+		}
+	}
+#endif
+
+	return 0;
+
+out_err_release:
+	if (err == -ENETUNREACH)
+		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
+	dst_release(*dst);
+	*dst = NULL;
+	return err;
+}
+
+/**
+ *	ip6_dst_lookup - perform route lookup on flow
+ *	@sk: socket which provides route info
+ *	@dst: pointer to dst_entry * for result
+ *	@fl6: flow to lookup
+ *
+ *	This function performs a route lookup on the given flow.
+ *
+ *	It returns zero on success, or a standard errno code on error.
+ */
+int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
+{
+	*dst = NULL;
+	return ip6_dst_lookup_tail(sk, dst, fl6);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup);
+
+/**
+ *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
+ *	@sk: socket which provides route info
+ *	@fl6: flow to lookup
+ *	@final_dst: final destination address for ipsec lookup
+ *	@can_sleep: we are in a sleepable context
+ *
+ *	This function performs a route lookup on the given flow.
+ *
+ *	It returns a valid dst pointer on success, or a pointer encoded
+ *	error code.
+ */
+struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+				      const struct in6_addr *final_dst,
+				      bool can_sleep)
+{
+	struct dst_entry *dst = NULL;
+	int err;
+
+	err = ip6_dst_lookup_tail(sk, &dst, fl6);
+	if (err)
+		return ERR_PTR(err);
+	if (final_dst)
+		fl6->daddr = *final_dst;
+	if (can_sleep)
+		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
+
+	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
+
+/**
+ *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
+ *	@sk: socket which provides the dst cache and route info
+ *	@fl6: flow to lookup
+ *	@final_dst: final destination address for ipsec lookup
+ *	@can_sleep: we are in a sleepable context
+ *
+ *	This function performs a route lookup on the given flow with the
+ *	possibility of using the cached route in the socket if it is valid.
+ *	It will take the socket dst lock when operating on the dst cache.
+ *	As a result, this function can only be used in process context.
+ *
+ *	It returns a valid dst pointer on success, or a pointer encoded
+ *	error code.
+ */
+struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+					 const struct in6_addr *final_dst,
+					 bool can_sleep)
+{
+	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
+	int err;
+
+	dst = ip6_sk_dst_check(sk, dst, fl6);
+
+	err = ip6_dst_lookup_tail(sk, &dst, fl6);
+	if (err)
+		return ERR_PTR(err);
+	if (final_dst)
+		fl6->daddr = *final_dst;
+	if (can_sleep)
+		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
+
+	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
+
+static inline int ip6_ufo_append_data(struct sock *sk,
+			int getfrag(void *from, char *to, int offset, int len,
+				    int odd, struct sk_buff *skb),
+			void *from, int length, int hh_len, int fragheaderlen,
+			int transhdrlen, int mtu, unsigned int flags,
+			struct rt6_info *rt)
+
+{
+	struct sk_buff *skb;
+	int err;
+
+	/* There is support for UDP large send offload by network
+	 * device, so create one single skb packet containing complete
+	 * udp datagram
+	 */
+	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+		struct frag_hdr fhdr;
+
+		skb = sock_alloc_send_skb(sk,
+			hh_len + fragheaderlen + transhdrlen + 20,
+			(flags & MSG_DONTWAIT), &err);
+		if (skb == NULL)
+			return err;
+
+		/* reserve space for Hardware header */
+		skb_reserve(skb, hh_len);
+
+		/* create space for UDP/IP header */
+		skb_put(skb, fragheaderlen + transhdrlen);
+
+		/* initialize network header pointer */
+		skb_reset_network_header(skb);
+
+		/* initialize protocol header pointer */
+		skb->transport_header = skb->network_header + fragheaderlen;
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum = 0;
+
+		/* Specify the length of each IPv6 datagram fragment.
+		 * It has to be a multiple of 8.
+		 */
+		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
+					     sizeof(struct frag_hdr)) & ~7;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+		ipv6_select_ident(&fhdr, rt);
+		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
+		__skb_queue_tail(&sk->sk_write_queue, skb);
+	}
+
+	return skb_append_datato_frags(sk, skb, getfrag, from,
+				       (length - transhdrlen));
+}
+
+static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
+					       gfp_t gfp)
+{
+	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
+}
+
+static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
+						gfp_t gfp)
+{
+	return src ?
+		kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
+}
+
+static void ip6_append_data_mtu(unsigned int *mtu,
+				int *maxfraglen,
+				unsigned int fragheaderlen,
+				struct sk_buff *skb,
+				struct rt6_info *rt,
+				bool pmtuprobe)
+{
+	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
+		if (skb == NULL) {
+			/* first fragment, reserve header_len */
+			*mtu = *mtu - rt->dst.header_len;
+
+		} else {
+			/*
+			 * this fragment is not first, the headers
+			 * space is regarded as data space.
+			 */
+			*mtu = min(*mtu, pmtuprobe ?
+				   rt->dst.dev->mtu :
+				   dst_mtu(rt->dst.path));
+		}
+		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
+			      + fragheaderlen - sizeof(struct frag_hdr);
+	}
+}
+
+int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
+	int offset, int len, int odd, struct sk_buff *skb),
+	void *from, int length, int transhdrlen,
+	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
+	struct rt6_info *rt, unsigned int flags, int dontfrag)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet_cork *cork;
+	struct sk_buff *skb, *skb_prev = NULL;
+	unsigned int maxfraglen, fragheaderlen, mtu;
+	int exthdrlen;
+	int dst_exthdrlen;
+	int hh_len;
+	int copy;
+	int err;
+	int offset = 0;
+	__u8 tx_flags = 0;
+
+	if (flags&MSG_PROBE)
+		return 0;
+	cork = &inet->cork.base;
+	if (skb_queue_empty(&sk->sk_write_queue)) {
+		/*
+		 * setup for corking
+		 */
+		if (opt) {
+			if (WARN_ON(np->cork.opt))
+				return -EINVAL;
+
+			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
+			if (unlikely(np->cork.opt == NULL))
+				return -ENOBUFS;
+
+			np->cork.opt->tot_len = opt->tot_len;
+			np->cork.opt->opt_flen = opt->opt_flen;
+			np->cork.opt->opt_nflen = opt->opt_nflen;
+
+			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
+							    sk->sk_allocation);
+			if (opt->dst0opt && !np->cork.opt->dst0opt)
+				return -ENOBUFS;
+
+			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
+							    sk->sk_allocation);
+			if (opt->dst1opt && !np->cork.opt->dst1opt)
+				return -ENOBUFS;
+
+			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
+							   sk->sk_allocation);
+			if (opt->hopopt && !np->cork.opt->hopopt)
+				return -ENOBUFS;
+
+			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
+							    sk->sk_allocation);
+			if (opt->srcrt && !np->cork.opt->srcrt)
+				return -ENOBUFS;
+
+			/* need source address above miyazawa*/
+		}
+		dst_hold(&rt->dst);
+		cork->dst = &rt->dst;
+		inet->cork.fl.u.ip6 = *fl6;
+		np->cork.hop_limit = hlimit;
+		np->cork.tclass = tclass;
+		if (rt->dst.flags & DST_XFRM_TUNNEL)
+			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
+		else
+			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+		if (np->frag_size < mtu) {
+			if (np->frag_size)
+				mtu = np->frag_size;
+		}
+		cork->fragsize = mtu;
+		if (dst_allfrag(rt->dst.path))
+			cork->flags |= IPCORK_ALLFRAG;
+		cork->length = 0;
+		exthdrlen = (opt ? opt->opt_flen : 0);
+		length += exthdrlen;
+		transhdrlen += exthdrlen;
+		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
+	} else {
+		rt = (struct rt6_info *)cork->dst;
+		fl6 = &inet->cork.fl.u.ip6;
+		opt = np->cork.opt;
+		transhdrlen = 0;
+		exthdrlen = 0;
+		dst_exthdrlen = 0;
+		mtu = cork->fragsize;
+	}
+
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+
+	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
+			(opt ? opt->opt_nflen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
+
+	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
+		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
+			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
+			return -EMSGSIZE;
+		}
+	}
+
+	/* For UDP, check if TX timestamp is enabled */
+	if (sk->sk_type == SOCK_DGRAM)
+		sock_tx_timestamp(sk, &tx_flags);
+
+	/*
+	 * Let's try using as much space as possible.
+	 * Use MTU if total length of the message fits into the MTU.
+	 * Otherwise, we need to reserve fragment header and
+	 * fragment alignment (= 8-15 octets, in total).
+	 *
+	 * Note that we may need to "move" the data from the tail
+	 * of the buffer to the new fragment when we split
+	 * the message.
+	 *
+	 * FIXME: It may be fragmented into multiple chunks
+	 *        at once if non-fragmentable extension headers
+	 *        are too large.
+	 * --yoshfuji
+	 */
+
+	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
+					   sk->sk_protocol == IPPROTO_RAW)) {
+		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
+		return -EMSGSIZE;
+	}
+
+	skb = skb_peek_tail(&sk->sk_write_queue);
+	cork->length += length;
+	if (((length > mtu) ||
+	     (skb && skb_has_frags(skb))) &&
+	    (sk->sk_protocol == IPPROTO_UDP) &&
+	    (rt->dst.dev->features & NETIF_F_UFO)) {
+		err = ip6_ufo_append_data(sk, getfrag, from, length,
+					  hh_len, fragheaderlen,
+					  transhdrlen, mtu, flags, rt);
+		if (err)
+			goto error;
+		return 0;
+	}
+
+	if (!skb)
+		goto alloc_new_skb;
+
+	while (length > 0) {
+		/* Check if the remaining data fits into current packet. */
+		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+		if (copy < length)
+			copy = maxfraglen - skb->len;
+
+		if (copy <= 0) {
+			char *data;
+			unsigned int datalen;
+			unsigned int fraglen;
+			unsigned int fraggap;
+			unsigned int alloclen;
+alloc_new_skb:
+			/* There's no room in the current skb */
+			if (skb)
+				fraggap = skb->len - maxfraglen;
+			else
+				fraggap = 0;
+			/* update mtu and maxfraglen if necessary */
+			if (skb == NULL || skb_prev == NULL)
+				ip6_append_data_mtu(&mtu, &maxfraglen,
+						    fragheaderlen, skb, rt,
+						    np->pmtudisc ==
+						    IPV6_PMTUDISC_PROBE);
+
+			skb_prev = skb;
+
+			/*
+			 * If remaining data exceeds the mtu,
+			 * we know we need more fragment(s).
+			 */
+			datalen = length + fraggap;
+
+			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
+				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+			if ((flags & MSG_MORE) &&
+			    !(rt->dst.dev->features&NETIF_F_SG))
+				alloclen = mtu;
+			else
+				alloclen = datalen + fragheaderlen;
+
+			alloclen += dst_exthdrlen;
+
+			if (datalen != length + fraggap) {
+				/*
+				 * this is not the last fragment, the trailer
+				 * space is regarded as data space.
+				 */
+				datalen += rt->dst.trailer_len;
+			}
+
+			alloclen += rt->dst.trailer_len;
+			fraglen = datalen + fragheaderlen;
+
+			/*
+			 * We just reserve space for fragment header.
+			 * Note: this may be overallocation if the message
+			 * (without MSG_MORE) fits into the MTU.
+			 */
+			alloclen += sizeof(struct frag_hdr);
+
+			if (transhdrlen) {
+				skb = sock_alloc_send_skb(sk,
+						alloclen + hh_len,
+						(flags & MSG_DONTWAIT), &err);
+			} else {
+				skb = NULL;
+				if (atomic_read(&sk->sk_wmem_alloc) <=
+				    2 * sk->sk_sndbuf)
+					skb = sock_wmalloc(sk,
+							   alloclen + hh_len, 1,
+							   sk->sk_allocation);
+				if (unlikely(skb == NULL))
+					err = -ENOBUFS;
+				else {
+					/* Only the initial fragment
+					 * is time stamped.
+					 */
+					tx_flags = 0;
+				}
+			}
+			if (skb == NULL)
+				goto error;
+			/*
+			 * Fill in the control structures
+			 */
+			skb->ip_summed = CHECKSUM_NONE;
+			skb->csum = 0;
+			/* reserve for fragmentation and ipsec header */
+			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
+				    dst_exthdrlen);
+
+			if (sk->sk_type == SOCK_DGRAM)
+				skb_shinfo(skb)->tx_flags = tx_flags;
+
+			/*
+			 * Find where to start putting bytes
+			 */
+			data = skb_put(skb, fraglen);
+			skb_set_network_header(skb, exthdrlen);
+			data += fragheaderlen;
+			skb->transport_header = (skb->network_header +
+						 fragheaderlen);
+			if (fraggap) {
+				skb->csum = skb_copy_and_csum_bits(
+					skb_prev, maxfraglen,
+					data + transhdrlen, fraggap, 0);
+				skb_prev->csum = csum_sub(skb_prev->csum,
+							  skb->csum);
+				data += fraggap;
+				pskb_trim_unique(skb_prev, maxfraglen);
+			}
+			copy = datalen - transhdrlen - fraggap;
+
+			if (copy < 0) {
+				err = -EINVAL;
+				kfree_skb(skb);
+				goto error;
+			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+				err = -EFAULT;
+				kfree_skb(skb);
+				goto error;
+			}
+
+			offset += copy;
+			length -= datalen - fraggap;
+			transhdrlen = 0;
+			exthdrlen = 0;
+			dst_exthdrlen = 0;
+
+			/*
+			 * Put the packet on the pending queue
+			 */
+			__skb_queue_tail(&sk->sk_write_queue, skb);
+			continue;
+		}
+
+		if (copy > length)
+			copy = length;
+
+		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+			unsigned int off;
+
+			off = skb->len;
+			if (getfrag(from, skb_put(skb, copy),
+				    offset, copy, off, skb) < 0) {
+				__skb_trim(skb, off);
+				err = -EFAULT;
+				goto error;
+			}
+		} else {
+			int i = skb_shinfo(skb)->nr_frags;
+			struct page_frag *pfrag = sk_page_frag(sk);
+
+			err = -ENOMEM;
+			if (!sk_page_frag_refill(sk, pfrag))
+				goto error;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				err = -EMSGSIZE;
+				if (i == MAX_SKB_FRAGS)
+					goto error;
+
+				__skb_fill_page_desc(skb, i, pfrag->page,
+						     pfrag->offset, 0);
+				skb_shinfo(skb)->nr_frags = ++i;
+				get_page(pfrag->page);
+			}
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
+			if (getfrag(from,
+				    page_address(pfrag->page) + pfrag->offset,
+				    offset, copy, skb->len, skb) < 0)
+				goto error_efault;
+
+			pfrag->offset += copy;
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+			skb->len += copy;
+			skb->data_len += copy;
+			skb->truesize += copy;
+			atomic_add(copy, &sk->sk_wmem_alloc);
+		}
+		offset += copy;
+		length -= copy;
+	}
+
+	return 0;
+
+error_efault:
+	err = -EFAULT;
+error:
+	cork->length -= length;
+	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip6_append_data);
+
+static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
+{
+	if (np->cork.opt) {
+		kfree(np->cork.opt->dst0opt);
+		kfree(np->cork.opt->dst1opt);
+		kfree(np->cork.opt->hopopt);
+		kfree(np->cork.opt->srcrt);
+		kfree(np->cork.opt);
+		np->cork.opt = NULL;
+	}
+
+	if (inet->cork.base.dst) {
+		dst_release(inet->cork.base.dst);
+		inet->cork.base.dst = NULL;
+		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
+	}
+	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
+}
+
+int ip6_push_pending_frames(struct sock *sk)
+{
+	struct sk_buff *skb, *tmp_skb;
+	struct sk_buff **tail_skb;
+	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ipv6hdr *hdr;
+	struct ipv6_txoptions *opt = np->cork.opt;
+	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
+	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
+	unsigned char proto = fl6->flowi6_proto;
+	int err = 0;
+
+	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+		goto out;
+	tail_skb = &(skb_shinfo(skb)->frag_list);
+
+	/* move skb->data to ip header from ext header */
+	if (skb->data < skb_network_header(skb))
+		__skb_pull(skb, skb_network_offset(skb));
+	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		__skb_pull(tmp_skb, skb_network_header_len(skb));
+		*tail_skb = tmp_skb;
+		tail_skb = &(tmp_skb->next);
+		skb->len += tmp_skb->len;
+		skb->data_len += tmp_skb->len;
+		skb->truesize += tmp_skb->truesize;
+		tmp_skb->destructor = NULL;
+		tmp_skb->sk = NULL;
+	}
+
+	/* Allow local fragmentation. */
+	if (np->pmtudisc < IPV6_PMTUDISC_DO)
+		skb->local_df = 1;
+
+	*final_dst = fl6->daddr;
+	__skb_pull(skb, skb_network_header_len(skb));
+	if (opt && opt->opt_flen)
+		ipv6_push_frag_opts(skb, opt, &proto);
+	if (opt && opt->opt_nflen)
+		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
+
+	skb_push(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	hdr = ipv6_hdr(skb);
+
+	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
+	hdr->hop_limit = np->cork.hop_limit;
+	hdr->nexthdr = proto;
+	hdr->saddr = fl6->saddr;
+	hdr->daddr = *final_dst;
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+
+	skb_dst_set(skb, dst_clone(&rt->dst));
+	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+	if (proto == IPPROTO_ICMPV6) {
+		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+
+		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
+		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
+	}
+
+	err = ip6_local_out(skb);
+	if (err) {
+		if (err > 0)
+			err = net_xmit_errno(err);
+		if (err)
+			goto error;
+	}
+
+out:
+	ip6_cork_release(inet, np);
+	return err;
+error:
+	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
+
+void ip6_flush_pending_frames(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
+		if (skb_dst(skb))
+			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
+				      IPSTATS_MIB_OUTDISCARDS);
+		kfree_skb(skb);
+	}
+
+	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
+}
+EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
diff -Naur linux-3.10.30.org/net/netfilter/core.c linux-3.10.30/net/netfilter/core.c
--- linux-3.10.30.org/net/netfilter/core.c	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/netfilter/core.c	2014-02-14 20:29:05.392738001 +0100
@@ -191,9 +191,11 @@
 		ret = NF_DROP_GETERR(verdict);
 		if (ret == 0)
 			ret = -EPERM;
-	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
+	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE ||
+		   (verdict & NF_VERDICT_MASK) == NF_IMQ_QUEUE) {
 		int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
-						verdict >> NF_VERDICT_QBITS);
+						verdict >> NF_VERDICT_QBITS,
+						verdict & NF_VERDICT_MASK);
 		if (err < 0) {
 			if (err == -ECANCELED)
 				goto next_hook;
diff -Naur linux-3.10.30.org/net/netfilter/Kconfig linux-3.10.30/net/netfilter/Kconfig
--- linux-3.10.30.org/net/netfilter/Kconfig	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/netfilter/Kconfig	2014-02-14 20:29:05.396071847 +0100
@@ -641,6 +641,18 @@
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_IMQ
+	tristate '"IMQ" target support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	select IMQ
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option adds an `IMQ' target which is used to specify if and
+	  to which imq device packets should get enqueued/dequeued.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_MARK
 	tristate '"MARK" target support'
 	depends on NETFILTER_ADVANCED
diff -Naur linux-3.10.30.org/net/netfilter/Makefile linux-3.10.30/net/netfilter/Makefile
--- linux-3.10.30.org/net/netfilter/Makefile	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/netfilter/Makefile	2014-02-14 20:29:05.396071847 +0100
@@ -82,6 +82,7 @@
 obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_IMQ) += xt_IMQ.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o
diff -Naur linux-3.10.30.org/net/netfilter/nf_internals.h linux-3.10.30/net/netfilter/nf_internals.h
--- linux-3.10.30.org/net/netfilter/nf_internals.h	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/netfilter/nf_internals.h	2014-02-14 20:29:05.396071847 +0100
@@ -29,7 +29,7 @@
 		    struct net_device *indev,
 		    struct net_device *outdev,
 		    int (*okfn)(struct sk_buff *),
-		    unsigned int queuenum);
+		    unsigned int queuenum, unsigned int queuetype);
 extern int __init netfilter_queue_init(void);
 
 /* nf_log.c */
diff -Naur linux-3.10.30.org/net/netfilter/nf_queue.c linux-3.10.30/net/netfilter/nf_queue.c
--- linux-3.10.30.org/net/netfilter/nf_queue.c	2014-02-13 22:48:15.000000000 +0100
+++ linux-3.10.30/net/netfilter/nf_queue.c	2014-02-14 20:29:05.396071847 +0100
@@ -27,6 +27,23 @@
  */
 static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
 
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+static const struct nf_queue_handler __rcu *queue_imq_handler __read_mostly;
+
+void nf_register_queue_imq_handler(const struct nf_queue_handler *qh)
+{
+	rcu_assign_pointer(queue_imq_handler, qh);
+}
+EXPORT_SYMBOL_GPL(nf_register_queue_imq_handler);
+
+void nf_unregister_queue_imq_handler(void)
+{
+	RCU_INIT_POINTER(queue_imq_handler, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_imq_handler);
+#endif
+
 /* return EBUSY when somebody else is registered, return EEXIST if the
  * same handler is registered, return 0 in case of success. */
 void nf_register_queue_handler(const struct nf_queue_handler *qh)
@@ -105,7 +122,8 @@
 	     struct net_device *indev,
 	     struct net_device *outdev,
 	     int (*okfn)(struct sk_buff *),
-	     unsigned int queuenum)
+	     unsigned int queuenum,
+	     unsigned int queuetype)
 {
 	int status = -ENOENT;
 	struct nf_queue_entry *entry = NULL;
@@ -115,7 +133,17 @@
 	/* QUEUE == DROP if no one is waiting, to be safe. */
 	rcu_read_lock();
 
-	qh = rcu_dereference(queue_handler);
+	if (queuetype == NF_IMQ_QUEUE) {
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+		qh = rcu_dereference(queue_imq_handler);
+#else
+		BUG();
+		goto err_unlock;
+#endif
+	} else {
+		qh = rcu_dereference(queue_handler);
+	}
+
 	if (!qh) {
 		status = -ESRCH;
 		goto err_unlock;
@@ -205,9 +233,11 @@
 		local_bh_enable();
 		break;
 	case NF_QUEUE:
+	case NF_IMQ_QUEUE:
 		err = nf_queue(skb, elem, entry->pf, entry->hook,
 			       entry->indev, entry->outdev, entry->okfn,
-			       verdict >> NF_VERDICT_QBITS);
+			       verdict >> NF_VERDICT_QBITS,
+			       verdict & NF_VERDICT_MASK);
 		if (err < 0) {
 			if (err == -ECANCELED)
 				goto next_hook;
diff -Naur linux-3.10.30.org/net/netfilter/xt_IMQ.c linux-3.10.30/net/netfilter/xt_IMQ.c
--- linux-3.10.30.org/net/netfilter/xt_IMQ.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-3.10.30/net/netfilter/xt_IMQ.c	2014-02-14 20:29:05.396071847 +0100
@@ -0,0 +1,72 @@
+/*
+ * This target marks packets to be enqueued to an imq device
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_IMQ.h>
+#include <linux/imq.h>
+
+static unsigned int imq_target(struct sk_buff *pskb,
+			       const struct xt_action_param *par)
+{
+	const struct xt_imq_info *mr = par->targinfo;
+
+	pskb->imq_flags = (mr->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE;
+
+	return XT_CONTINUE;
+}
+
+static int imq_checkentry(const struct xt_tgchk_param *par)
+{
+	struct xt_imq_info *mr = par->targinfo;
+
+	if (mr->todev > IMQ_MAX_DEVS - 1) {
+		pr_warn("IMQ: invalid device specified, highest is %u\n",
+			IMQ_MAX_DEVS - 1);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_target xt_imq_reg[] __read_mostly = {
+	{
+		.name		= "IMQ",
+		.family		= AF_INET,
+		.checkentry	= imq_checkentry,
+		.target		= imq_target,
+		.targetsize	= sizeof(struct xt_imq_info),
+		.table		= "mangle",
+		.me		= THIS_MODULE
+	},
+	{
+		.name		= "IMQ",
+		.family		= AF_INET6,
+		.checkentry	= imq_checkentry,
+		.target		= imq_target,
+		.targetsize	= sizeof(struct xt_imq_info),
+		.table		= "mangle",
+		.me		= THIS_MODULE
+	},
+};
+
+static int __init imq_init(void)
+{
+	return xt_register_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg));
+}
+
+static void __exit imq_fini(void)
+{
+	xt_unregister_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg));
+}
+
+module_init(imq_init);
+module_exit(imq_fini);
+
+MODULE_AUTHOR("http://www.linuximq.net");
+MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See http://www.linuximq.net/ for more information.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_IMQ");
+MODULE_ALIAS("ip6t_IMQ");
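
Usage sketch. On a kernel with this patch applied (CONFIG_IMQ and NETFILTER_XT_TARGET_IMQ enabled), traffic is diverted to an imq device from the mangle table and shaped there with an ordinary qdisc. The commands below are illustrative only: they assume a modular IMQ build, and the `numdevs' module parameter and `--todev' target option follow the linuximq.net documentation and its userspace iptables extension, so verify both against your installed tools.

    # load IMQ with two devices and bring the first one up
    modprobe imq numdevs=2
    ip link set imq0 up

    # in the mangle table, divert ingress traffic arriving on eth0 to imq0
    iptables -t mangle -A PREROUTING -i eth0 -j IMQ --todev 0

    # attach any qdisc to imq0 to shape the diverted traffic, e.g. HTB
    tc qdisc add dev imq0 root handle 1: htb default 10

Note that imq_target() only sets IMQ_F_ENQUEUE in skb->imq_flags and returns XT_CONTINUE, so rule traversal proceeds normally; the actual diversion happens later through the NF_IMQ_QUEUE verdict, which nf_queue() dispatches to the separate queue_imq_handler rather than the regular queue_handler.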