Linux kernel & device driver programming

Cross-Referenced Linux and Device Driver Code

[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]
Version: [ 2.6.11.8 ] [ 2.6.25 ] [ 2.6.25.8 ] [ 2.6.31.13 ] Architecture: [ i386 ]
  1 /*
  2  *      NET3    Protocol independent device support routines.
  3  *
  4  *              This program is free software; you can redistribute it and/or
  5  *              modify it under the terms of the GNU General Public License
  6  *              as published by the Free Software Foundation; either version
  7  *              2 of the License, or (at your option) any later version.
  8  *
  9  *      Derived from the non IP parts of dev.c 1.0.19
 10  *              Authors:        Ross Biro
 11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 13  *
 14  *      Additional Authors:
 15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
 16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
 17  *              David Hinds <dahinds@users.sourceforge.net>
 18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 19  *              Adam Sulmicki <adam@cfar.umd.edu>
 20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 21  *
 22  *      Changes:
 23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 24  *                                      to 2 if register_netdev gets called
 25  *                                      before net_dev_init & also removed a
 26  *                                      few lines of code in the process.
 27  *              Alan Cox        :       device private ioctl copies fields back.
 28  *              Alan Cox        :       Transmit queue code does relevant
 29  *                                      stunts to keep the queue safe.
 30  *              Alan Cox        :       Fixed double lock.
 31  *              Alan Cox        :       Fixed promisc NULL pointer trap
 32  *              ????????        :       Support the full private ioctl range
 33  *              Alan Cox        :       Moved ioctl permission check into
 34  *                                      drivers
 35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 36  *              Alan Cox        :       100 backlog just doesn't cut it when
 37  *                                      you start doing multicast video 8)
 38  *              Alan Cox        :       Rewrote net_bh and list manager.
 39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 40  *              Alan Cox        :       Took out transmit every packet pass
 41  *                                      Saved a few bytes in the ioctl handler
 42  *              Alan Cox        :       Network driver sets packet type before
 43  *                                      calling netif_rx. Saves a function
 44  *                                      call a packet.
 45  *              Alan Cox        :       Hashed net_bh()
 46  *              Richard Kooijman:       Timestamp fixes.
 47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 48  *              Alan Cox        :       Device lock protection.
 49  *              Alan Cox        :       Fixed nasty side effect of device close
 50  *                                      changes.
 51  *              Rudi Cilibrasi  :       Pass the right thing to
 52  *                                      set_mac_address()
 53  *              Dave Miller     :       32bit quantity for the device lock to
 54  *                                      make it work out on a Sparc.
 55  *              Bjorn Ekwall    :       Added KERNELD hack.
 56  *              Alan Cox        :       Cleaned up the backlog initialise.
 57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
 58  *                                      1 device.
 59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
 60  *                                      is no device open function.
 61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
 63  *              Cyrus Durgin    :       Cleaned for KMOD
 64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 65  *                                      A network device unload needs to purge
 66  *                                      the backlog queue.
 67  *      Paul Rusty Russell      :       SIOCSIFNAME
 68  *              Pekka Riikonen  :       Netdev boot-time settings code
 69  *              Andrew Morton   :       Make unregister_netdevice wait
 70  *                                      indefinitely on dev->refcnt
 71  *              J Hadi Salim    :       - Backlog queue sampling
 72  *                                      - netif_rx() feedback
 73  */
 74 
 75 #include <asm/uaccess.h>
 76 #include <asm/system.h>
 77 #include <linux/bitops.h>
 78 #include <linux/capability.h>
 79 #include <linux/cpu.h>
 80 #include <linux/types.h>
 81 #include <linux/kernel.h>
 82 #include <linux/sched.h>
 83 #include <linux/mutex.h>
 84 #include <linux/string.h>
 85 #include <linux/mm.h>
 86 #include <linux/socket.h>
 87 #include <linux/sockios.h>
 88 #include <linux/errno.h>
 89 #include <linux/interrupt.h>
 90 #include <linux/if_ether.h>
 91 #include <linux/netdevice.h>
 92 #include <linux/etherdevice.h>
 93 #include <linux/notifier.h>
 94 #include <linux/skbuff.h>
 95 #include <net/net_namespace.h>
 96 #include <net/sock.h>
 97 #include <linux/rtnetlink.h>
 98 #include <linux/proc_fs.h>
 99 #include <linux/seq_file.h>
100 #include <linux/stat.h>
101 #include <linux/if_bridge.h>
102 #include <linux/if_macvlan.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/kmod.h>
109 #include <linux/module.h>
110 #include <linux/kallsyms.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 
123 #include "net-sysfs.h"
124 
125 /*
126  *      The list of packet types we will receive (as opposed to discard)
127  *      and the routines to invoke.
128  *
129  *      Why 16? Because with 16 the only overlap we get on a hash of the
130  *      low nibble of the protocol value is RARP/SNAP/X.25.
131  *
132  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
133  *             sure which should go first, but I bet it won't make much
134  *             difference if we are running VLANs.  The good news is that
135  *             this protocol won't be in the list unless compiled in, so
136  *             the average user (w/out VLANs) will not be adversely affected.
137  *             --BLG
138  *
139  *              0800    IP
140  *              8100    802.1Q VLAN
141  *              0001    802.3
142  *              0002    AX.25
143  *              0004    802.2
144  *              8035    RARP
145  *              0005    SNAP
146  *              0805    X.25
147  *              0806    ARP
148  *              8137    IPX
149  *              0009    Localtalk
150  *              86DD    IPv6
151  */
152 
/* Number of buckets in the ptype_base hash table. */
#define PTYPE_HASH_SIZE (16)
/* Mask applied to the host-order protocol value to pick a bucket. */
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Held (with BHs disabled) while ptype_all / ptype_base are modified. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for specific protocols, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered with type ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;        /* Taps */
159 
#ifdef CONFIG_NET_DMA
/* State kept by this file for its DMA-engine client (CONFIG_NET_DMA only). */
struct net_dma {
        struct dma_client client;       /* holds the event callback set below */
        spinlock_t lock;
        cpumask_t channel_mask;
        /* NR_CPUS slots; presumably indexed by CPU number -- TODO confirm */
        struct dma_chan *channels[NR_CPUS];
};

/*
 * Forward declaration so the initializer below can reference the callback;
 * the definition is not in this part of the file.
 */
static enum dma_state_client
netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
        enum dma_state state);

/* The single net_dma instance; only the client's event callback is set here. */
static struct net_dma net_dma = {
        .client = {
                .event_callback = netdev_dma_event,
        },
};
#endif
178 
179 /*
180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
181  * semaphore.
182  *
183  * Pure readers hold dev_base_lock for reading.
184  *
185  * Writers must hold the rtnl semaphore while they loop through the
186  * dev_base_head list, and hold dev_base_lock for writing when they do the
187  * actual updates.  This allows pure readers to access the list even
188  * while a writer is preparing to update it.
189  *
190  * To put it another way, dev_base_lock is held for writing only to
191  * protect against pure readers; the rtnl semaphore provides the
192  * protection against other writers.
193  *
194  * See, for example usages, register_netdevice() and
195  * unregister_netdevice(), which must be called with the rtnl
196  * semaphore held.
197  */
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base_lock);

/* log2 of the bucket count used by the per-namespace name and index hashes */
#define NETDEV_HASHBITS 8
/* Bucket count derived from NETDEV_HASHBITS (1 << 8 = 256). */
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
204 
205 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
206 {
207         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
208         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
209 }
210 
211 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
212 {
213         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
214 }
215 
216 /* Device list insertion */
217 static int list_netdevice(struct net_device *dev)
218 {
219         struct net *net = dev->nd_net;
220 
221         ASSERT_RTNL();
222 
223         write_lock_bh(&dev_base_lock);
224         list_add_tail(&dev->dev_list, &net->dev_base_head);
225         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
226         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
227         write_unlock_bh(&dev_base_lock);
228         return 0;
229 }
230 
/*
 * Device list removal.
 *
 * Unlinks @dev from the device list and from the name and ifindex hash
 * chains.  Asserts that the RTNL is held; the unlinking is done with
 * dev_base_lock held for writing.
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del(&dev->dev_list);
        hlist_del(&dev->name_hlist);
        hlist_del(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);
}
243 
/*
 *      Our notifier list: head of the notifier chain owned by this file,
 *      declared with RAW_NOTIFIER_HEAD.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.  One instance exists per CPU.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
256 
257 #ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * register_netdevice() inits dev->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
 * position of a device type in the first selects the lock class name in
 * the second and the key in netdev_xmit_lock_key[].  The last entry
 * (ARPHRD_NONE / "_xmit_NONE") doubles as the default for types that
 * are not listed.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
         ARPHRD_NONE};

/* Lock class names, in the same order as netdev_lock_type[]. */
static const char *netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
         "_xmit_NONE"};

/* One lockdep key per entry of netdev_lock_type[]. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
297 
298 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
299 {
300         int i;
301 
302         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
303                 if (netdev_lock_type[i] == dev_type)
304                         return i;
305         /* the last key is used by default */
306         return ARRAY_SIZE(netdev_lock_type) - 1;
307 }
308 
309 static inline void netdev_set_lockdep_class(spinlock_t *lock,
310                                             unsigned short dev_type)
311 {
312         int i;
313 
314         i = netdev_lock_pos(dev_type);
315         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
316                                    netdev_lock_name[i]);
317 }
318 #else
/* Without CONFIG_DEBUG_LOCK_ALLOC there is no lock class to set: no-op. */
static inline void netdev_set_lockdep_class(spinlock_t *lock,
                                            unsigned short dev_type)
{
}
323 #endif
324 
325 /*******************************************************************************
326 
327                 Protocol management and registration routines
328 
329 *******************************************************************************/
330 
331 /*
332  *      Add a protocol ID to the list. Now that the input handler is
333  *      smarter we can dispense with all the messy stuff that used to be
334  *      here.
335  *
336  *      BEWARE!!! Protocol handlers, mangling input packets,
337  *      MUST BE last in hash buckets and checking protocol handlers
338  *      MUST start from promiscuous ptype_all chain in net_bh.
339  *      It is true now, do not change it.
340  *      Explanation follows: if protocol handler, mangling packet, will
341  *      be the first on list, it is not able to sense, that packet
342  *      is cloned and should be copied-on-write, so that it will
343  *      change it and subsequent readers will get broken packet.
344  *                                                      --ANK (980803)
345  */
346 
347 /**
348  *      dev_add_pack - add packet handler
349  *      @pt: packet type declaration
350  *
351  *      Add a protocol handler to the networking stack. The passed &packet_type
352  *      is linked into kernel lists and may not be freed until it has been
353  *      removed from the kernel lists.
354  *
355  *      This call does not sleep, therefore it cannot
356  *      guarantee that all CPUs that are in the middle of receiving packets
357  *      will see the new packet type (until the next received packet).
358  */
359 
360 void dev_add_pack(struct packet_type *pt)
361 {
362         int hash;
363 
364         spin_lock_bh(&ptype_lock);
365         if (pt->type == htons(ETH_P_ALL))
366                 list_add_rcu(&pt->list, &ptype_all);
367         else {
368                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
369                 list_add_rcu(&pt->list, &ptype_base[hash]);
370         }
371         spin_unlock_bh(&ptype_lock);
372 }
373 
374 /**
375  *      __dev_remove_pack        - remove packet handler
376  *      @pt: packet type declaration
377  *
378  *      Remove a protocol handler that was previously added to the kernel
379  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
380  *      from the kernel lists, but it is not yet safe to free or reuse it when
381  *      this function returns (see below).
382  *
383  *      The packet type might still be in use by receivers
384  *      and must not be freed until after all the CPU's have gone
385  *      through a quiescent state.
386  */
387 void __dev_remove_pack(struct packet_type *pt)
388 {
389         struct list_head *head;
390         struct packet_type *pt1;
391 
392         spin_lock_bh(&ptype_lock);
393 
394         if (pt->type == htons(ETH_P_ALL))
395                 head = &ptype_all;
396         else
397                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
398 
399         list_for_each_entry(pt1, head, list) {
400                 if (pt == pt1) {
401                         list_del_rcu(&pt->list);
402                         goto out;
403                 }
404         }
405 
406         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
407 out:
408         spin_unlock_bh(&ptype_lock);
409 }
410 /**
411  *      dev_remove_pack  - remove packet handler
412  *      @pt: packet type declaration
413  *
414  *      Remove a protocol handler that was previously added to the kernel
415  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
416  *      from the kernel lists and can be freed or reused once this function
417  *      returns.
418  *
419  *      This call sleeps to guarantee that no CPU is looking at the packet
420  *      type after return.
421  */
void dev_remove_pack(struct packet_type *pt)
{
        /* Unlink the handler from its list ... */
        __dev_remove_pack(pt);

        /* ... then call synchronize_net() before returning to the caller */
        synchronize_net();
}
428 
429 /******************************************************************************
430 
431                       Device Boot-time Settings Routines
432 
433 *******************************************************************************/
434 
/*
 * Boot time configuration table.  Fixed size; a slot counts as unused
 * while its name starts with '\0' or ' '.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
437 
438 /**
439  *      netdev_boot_setup_add   - add new setup entry
440  *      @name: name of the device
441  *      @map: configured settings for the device
442  *
443  *      Adds new setup entry to the dev_boot_setup list.  The function
444  *      returns 0 on error and 1 on success.  This is a generic routine to
445  *      all netdevices.
446  */
447 static int netdev_boot_setup_add(char *name, struct ifmap *map)
448 {
449         struct netdev_boot_setup *s;
450         int i;
451 
452         s = dev_boot_setup;
453         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
454                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
455                         memset(s[i].name, 0, sizeof(s[i].name));
456                         strcpy(s[i].name, name);
457                         memcpy(&s[i].map, map, sizeof(s[i].map));
458                         break;
459                 }
460         }
461 
462         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
463 }
464 
465 /**
466  *      netdev_boot_setup_check - check boot time settings
467  *      @dev: the netdevice
468  *
469  *      Check boot time settings for the device.
470  *      The found settings are set for the device to be used
471  *      later in the device probing.
472  *      Returns 0 if no settings found, 1 if they are.
473  */
474 int netdev_boot_setup_check(struct net_device *dev)
475 {
476         struct netdev_boot_setup *s = dev_boot_setup;
477         int i;
478 
479         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
480                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
481                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
482                         dev->irq        = s[i].map.irq;
483                         dev->base_addr  = s[i].map.base_addr;
484                         dev->mem_start  = s[i].map.mem_start;
485                         dev->mem_end    = s[i].map.mem_end;
486                         return 1;
487                 }
488         }
489         return 0;
490 }
491 
492 
493 /**
494  *      netdev_boot_base        - get address from boot time settings
495  *      @prefix: prefix for network device
496  *      @unit: id for network device
497  *
498  *      Check boot time settings for the base address of device.
499  *      The found settings are set for the device to be used
500  *      later in the device probing.
501  *      Returns 0 if no settings found.
502  */
503 unsigned long netdev_boot_base(const char *prefix, int unit)
504 {
505         const struct netdev_boot_setup *s = dev_boot_setup;
506         char name[IFNAMSIZ];
507         int i;
508 
509         sprintf(name, "%s%d", prefix, unit);
510 
511         /*
512          * If device already registered then return base of 1
513          * to indicate not to probe for this interface
514          */
515         if (__dev_get_by_name(&init_net, name))
516                 return 1;
517 
518         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
519                 if (!strcmp(name, s[i].name))
520                         return s[i].map.base_addr;
521         return 0;
522 }
523 
524 /*
525  * Saves at boot time configured settings for any netdevice.
526  */
527 int __init netdev_boot_setup(char *str)
528 {
529         int ints[5];
530         struct ifmap map;
531 
532         str = get_options(str, ARRAY_SIZE(ints), ints);
533         if (!str || !*str)
534                 return 0;
535 
536         /* Save settings */
537         memset(&map, 0, sizeof(map));
538         if (ints[0] > 0)
539                 map.irq = ints[1];
540         if (ints[0] > 1)
541                 map.base_addr = ints[2];
542         if (ints[0] > 2)
543                 map.mem_start = ints[3];
544         if (ints[0] > 3)
545                 map.mem_end = ints[4];
546 
547         /* Add new entry to the list */
548         return netdev_boot_setup_add(str, &map);
549 }
550 
551 __setup("netdev=", netdev_boot_setup);
552 
553 /*******************************************************************************
554 
555                             Device Interface Subroutines
556 
557 *******************************************************************************/
558 
559 /**
560  *      __dev_get_by_name       - find a device by its name
561  *      @net: the applicable net namespace
562  *      @name: name to find
563  *
564  *      Find an interface by name. Must be called under RTNL semaphore
565  *      or @dev_base_lock. If the name is found a pointer to the device
566  *      is returned. If the name is not found then %NULL is returned. The
567  *      reference counters are not incremented so the caller must be
568  *      careful with locks.
569  */
570 
571 struct net_device *__dev_get_by_name(struct net *net, const char *name)
572 {
573         struct hlist_node *p;
574 
575         hlist_for_each(p, dev_name_hash(net, name)) {
576                 struct net_device *dev
577                         = hlist_entry(p, struct net_device, name_hlist);
578                 if (!strncmp(dev->name, name, IFNAMSIZ))
579                         return dev;
580         }
581         return NULL;
582 }
583 
584 /**
585  *      dev_get_by_name         - find a device by its name
586  *      @net: the applicable net namespace
587  *      @name: name to find
588  *
589  *      Find an interface by name. This can be called from any
590  *      context and does its own locking. The returned handle has
591  *      the usage count incremented and the caller must use dev_put() to
592  *      release it when it is no longer needed. %NULL is returned if no
593  *      matching device is found.
594  */
595 
596 struct net_device *dev_get_by_name(struct net *net, const char *name)
597 {
598         struct net_device *dev;
599 
600         read_lock(&dev_base_lock);
601         dev = __dev_get_by_name(net, name);
602         if (dev)
603                 dev_hold(dev);
604         read_unlock(&dev_base_lock);
605         return dev;
606 }
607 
608 /**
609  *      __dev_get_by_index - find a device by its ifindex
610  *      @net: the applicable net namespace
611  *      @ifindex: index of device
612  *
613  *      Search for an interface by index. Returns %NULL if the device
614  *      is not found or a pointer to the device. The device has not
615  *      had its reference counter increased so the caller must be careful
616  *      about locking. The caller must hold either the RTNL semaphore
617  *      or @dev_base_lock.
618  */
619 
620 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
621 {
622         struct hlist_node *p;
623 
624         hlist_for_each(p, dev_index_hash(net, ifindex)) {
625                 struct net_device *dev
626                         = hlist_entry(p, struct net_device, index_hlist);
627                 if (dev->ifindex == ifindex)
628                         return dev;
629         }
630         return NULL;
631 }
632 
633 
634 /**
635  *      dev_get_by_index - find a device by its ifindex
636  *      @net: the applicable net namespace
637  *      @ifindex: index of device
638  *
639  *      Search for an interface by index. Returns NULL if the device
640  *      is not found or a pointer to the device. The device returned has
641  *      had a reference added and the pointer is safe until the user calls
642  *      dev_put to indicate they have finished with it.
643  */
644 
645 struct net_device *dev_get_by_index(struct net *net, int ifindex)
646 {
647         struct net_device *dev;
648 
649         read_lock(&dev_base_lock);
650         dev = __dev_get_by_index(net, ifindex);
651         if (dev)
652                 dev_hold(dev);
653         read_unlock(&dev_base_lock);
654         return dev;
655 }
656 
657 /**
658  *      dev_getbyhwaddr - find a device by its hardware address
659  *      @net: the applicable net namespace
660  *      @type: media type of device
661  *      @ha: hardware address
662  *
663  *      Search for an interface by MAC address. Returns NULL if the device
664  *      is not found or a pointer to the device. The caller must hold the
665  *      rtnl semaphore. The returned device has not had its ref count increased
666  *      and the caller must therefore be careful about locking
667  *
668  *      BUGS:
669  *      If the API was consistent this would be __dev_get_by_hwaddr
670  */
671 
672 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
673 {
674         struct net_device *dev;
675 
676         ASSERT_RTNL();
677 
678         for_each_netdev(net, dev)
679                 if (dev->type == type &&
680                     !memcmp(dev->dev_addr, ha, dev->addr_len))
681                         return dev;
682 
683         return NULL;
684 }
685 
686 EXPORT_SYMBOL(dev_getbyhwaddr);
687 
688 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
689 {
690         struct net_device *dev;
691 
692         ASSERT_RTNL();
693         for_each_netdev(net, dev)
694                 if (dev->type == type)
695                         return dev;
696 
697         return NULL;
698 }
699 
700 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
701 
702 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
703 {
704         struct net_device *dev;
705 
706         rtnl_lock();
707         dev = __dev_getfirstbyhwtype(net, type);
708         if (dev)
709                 dev_hold(dev);
710         rtnl_unlock();
711         return dev;
712 }
713 
714 EXPORT_SYMBOL(dev_getfirstbyhwtype);
715 
716 /**
717  *      dev_get_by_flags - find any device with given flags
718  *      @net: the applicable net namespace
719  *      @if_flags: IFF_* values
720  *      @mask: bitmask of bits in if_flags to check
721  *
722  *      Search for any interface with the given flags. Returns NULL if a device
723  *      is not found or a pointer to the device. The device returned has
724  *      had a reference added and the pointer is safe until the user calls
725  *      dev_put to indicate they have finished with it.
726  */
727 
728 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
729 {
730         struct net_device *dev, *ret;
731 
732         ret = NULL;
733         read_lock(&dev_base_lock);
734         for_each_netdev(net, dev) {
735                 if (((dev->flags ^ if_flags) & mask) == 0) {
736                         dev_hold(dev);
737                         ret = dev;
738                         break;
739                 }
740         }
741         read_unlock(&dev_base_lock);
742         return ret;
743 }
744 
745 /**
746  *      dev_valid_name - check if name is okay for network device
747  *      @name: name string
748  *
749  *      Network device names need to be valid file names
750  *      to allow sysfs to work.  We also disallow any kind of
751  *      whitespace.
752  */
int dev_valid_name(const char *name)
{
        const char *c;

        /* Must be non-empty and fit, with its terminator, in IFNAMSIZ bytes */
        if (name[0] == '\0' || strlen(name) >= IFNAMSIZ)
                return 0;

        /* "." and ".." are rejected outright */
        if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
                return 0;

        /* No path separators and no whitespace anywhere in the name */
        for (c = name; *c != '\0'; c++) {
                if (*c == '/' || isspace(*c))
                        return 0;
        }

        return 1;
}
769 
770 /**
771  *      __dev_alloc_name - allocate a name for a device
772  *      @net: network namespace to allocate the device name in
773  *      @name: name format string
774  *      @buf:  scratch buffer and result name string
775  *
776  *      Passed a format string - eg "lt%d" it will try and find a suitable
777  *      id. It scans list of devices to build up a free map, then chooses
778  *      the first empty slot. The caller must hold the dev_base or rtnl lock
779  *      while allocating the name and adding the device in order to avoid
780  *      duplicates.
781  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
782  *      Returns the number of the unit assigned or a negative errno code.
783  */
784 
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        /* one bit per candidate unit number, one page worth of bits */
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        /* Only the first IFNAMSIZ-1 bytes of @name are searched for a '%' */
        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                /* Mark every unit number already taken by a device in @net */
                for_each_netdev(net, d) {
                        /* skip names from which no unit number is parsed */
                        if (!sscanf(d->name, name, &i))
                                continue;
                        /* skip unit numbers the bitmap cannot represent */
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                /* lowest unit number not marked in use */
                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        /*
         * Build the candidate name (with i == 0 when @name had no '%')
         * and accept it only if no device in @net already uses it.
         */
        snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
834 
835 /**
836  *      dev_alloc_name - allocate a name for a device
837  *      @dev: device
838  *      @name: name format string
839  *
840  *      Passed a format string - eg "lt%d" it will try and find a suitable
841  *      id. It scans list of devices to build up a free map, then chooses
842  *      the first empty slot. The caller must hold the dev_base or rtnl lock
843  *      while allocating the name and adding the device in order to avoid
844  *      duplicates.
845  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
846  *      Returns the number of the unit assigned or a negative errno code.
847  */
848 
849 int dev_alloc_name(struct net_device *dev, const char *name)
850 {
851         char buf[IFNAMSIZ];
852         struct net *net;
853         int ret;
854 
855         BUG_ON(!dev->nd_net);
856         net = dev->nd_net;
857         ret = __dev_alloc_name(net, name, buf);
858         if (ret >= 0)
859                 strlcpy(dev->name, buf, IFNAMSIZ);
860         return ret;
861 }
862 
863 
864 /**
865  *      dev_change_name - change name of a device
866  *      @dev: device
867  *      @newname: name (or format string) must be at least IFNAMSIZ
868  *
869  *      Change name of a device, can pass format strings "eth%d".
870  *      for wildcarding.
871  */
872 int dev_change_name(struct net_device *dev, char *newname)
873 {
874         char oldname[IFNAMSIZ];
875         int err = 0;
876         int ret;
877         struct net *net;
878 
879         ASSERT_RTNL();
880         BUG_ON(!dev->nd_net);
881 
882         net = dev->nd_net;
883         if (dev->flags & IFF_UP)
884                 return -EBUSY;
885 
886         if (!dev_valid_name(newname))
887                 return -EINVAL;
888 
889         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
890                 return 0;
891 
892         memcpy(oldname, dev->name, IFNAMSIZ);
893 
894         if (strchr(newname, '%')) {
895                 err = dev_alloc_name(dev, newname);
896                 if (err < 0)
897                         return err;
898                 strcpy(newname, dev->name);
899         }
900         else if (__dev_get_by_name(net, newname))
901                 return -EEXIST;
902         else
903                 strlcpy(dev->name, newname, IFNAMSIZ);
904 
905 rollback:
906         device_rename(&dev->dev, dev->name);
907 
908         write_lock_bh(&dev_base_lock);
909         hlist_del(&dev->name_hlist);
910         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
911         write_unlock_bh(&dev_base_lock);
912 
913         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
914         ret = notifier_to_errno(ret);
915 
916         if (ret) {
917                 if (err) {
918                         printk(KERN_ERR
919                                "%s: name change rollback failed: %d.\n",
920                                dev->name, ret);
921                 } else {
922                         err = ret;
923                         memcpy(dev->name, oldname, IFNAMSIZ);
924                         goto rollback;
925                 }
926         }
927 
928         return err;
929 }
930 
931 /**
932  *      netdev_features_change - device changes features
933  *      @dev: device to cause notification
934  *
935  *      Called to indicate a device has changed features.
936  */
937 void netdev_features_change(struct net_device *dev)
938 {
939         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
940 }
941 EXPORT_SYMBOL(netdev_features_change);
942 
943 /**
944  *      netdev_state_change - device changes state
945  *      @dev: device to cause notification
946  *
947  *      Called to indicate a device has changed state. This function calls
948  *      the notifier chains for netdev_chain and sends a NEWLINK message
949  *      to the routing socket.
950  */
951 void netdev_state_change(struct net_device *dev)
952 {
953         if (dev->flags & IFF_UP) {
954                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
955                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
956         }
957 }
958 
959 /**
960  *      dev_load        - load a network module
961  *      @net: the applicable net namespace
962  *      @name: name of interface
963  *
964  *      If a network interface is not present and the process has suitable
965  *      privileges this function loads the module. If module loading is not
966  *      available in this kernel then it becomes a nop.
967  */
968 
969 void dev_load(struct net *net, const char *name)
970 {
971         struct net_device *dev;
972 
973         read_lock(&dev_base_lock);
974         dev = __dev_get_by_name(net, name);
975         read_unlock(&dev_base_lock);
976 
977         if (!dev && capable(CAP_SYS_MODULE))
978                 request_module("%s", name);
979 }
980 
981 /**
982  *      dev_open        - prepare an interface for use.
983  *      @dev:   device to open
984  *
985  *      Takes a device from down to up state. The device's private open
986  *      function is invoked and then the multicast lists are loaded. Finally
987  *      the device is moved into the up state and a %NETDEV_UP message is
988  *      sent to the netdev notifier chain.
989  *
990  *      Calling this function on an active interface is a nop. On a failure
991  *      a negative errno code is returned.
992  */
993 int dev_open(struct net_device *dev)
994 {
995         int ret = 0;
996 
997         /*
998          *      Is it already up?
999          */
1000 
1001         if (dev->flags & IFF_UP)
1002                 return 0;
1003 
1004         /*
1005          *      Is it even present?
1006          */
1007         if (!netif_device_present(dev))
1008                 return -ENODEV;
1009 
1010         /*
1011          *      Call device private open method
1012          */
1013         set_bit(__LINK_STATE_START, &dev->state);
1014 
1015         if (dev->validate_addr)
1016                 ret = dev->validate_addr(dev);
1017 
1018         if (!ret && dev->open)
1019                 ret = dev->open(dev);
1020 
1021         /*
1022          *      If it went open OK then:
1023          */
1024 
1025         if (ret)
1026                 clear_bit(__LINK_STATE_START, &dev->state);
1027         else {
1028                 /*
1029                  *      Set the flags.
1030                  */
1031                 dev->flags |= IFF_UP;
1032 
1033                 /*
1034                  *      Initialize multicasting status
1035                  */
1036                 dev_set_rx_mode(dev);
1037 
1038                 /*
1039                  *      Wakeup transmit queue engine
1040                  */
1041                 dev_activate(dev);
1042 
1043                 /*
1044                  *      ... and announce new interface.
1045                  */
1046                 call_netdevice_notifiers(NETDEV_UP, dev);
1047         }
1048 
1049         return ret;
1050 }
1051 
1052 /**
1053  *      dev_close - shutdown an interface.
1054  *      @dev: device to shutdown
1055  *
1056  *      This function moves an active device into down state. A
1057  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1058  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1059  *      chain.
1060  */
1061 int dev_close(struct net_device *dev)
1062 {
1063         might_sleep();
1064 
1065         if (!(dev->flags & IFF_UP))
1066                 return 0;
1067 
1068         /*
1069          *      Tell people we are going down, so that they can
1070          *      prepare to death, when device is still operating.
1071          */
1072         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1073 
1074         clear_bit(__LINK_STATE_START, &dev->state);
1075 
1076         /* Synchronize to scheduled poll. We cannot touch poll list,
1077          * it can be even on different cpu. So just clear netif_running().
1078          *
1079          * dev->stop() will invoke napi_disable() on all of it's
1080          * napi_struct instances on this device.
1081          */
1082         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1083 
1084         dev_deactivate(dev);
1085 
1086         /*
1087          *      Call the device specific close. This cannot fail.
1088          *      Only if device is UP
1089          *
1090          *      We allow it to be called even after a DETACH hot-plug
1091          *      event.
1092          */
1093         if (dev->stop)
1094                 dev->stop(dev);
1095 
1096         /*
1097          *      Device is now down.
1098          */
1099 
1100         dev->flags &= ~IFF_UP;
1101 
1102         /*
1103          * Tell people we are down
1104          */
1105         call_netdevice_notifiers(NETDEV_DOWN, dev);
1106 
1107         return 0;
1108 }
1109 
1110 
/*
 * Starts out non-zero. While it is set, register_netdevice_notifier()
 * registers the notifier but does not replay events for existing devices.
 */
static int dev_boot_phase = 1;
1112 
1113 /*
1114  *      Device change register/unregister. These are not inline or static
1115  *      as we export them to the world.
1116  */
1117 
1118 /**
1119  *      register_netdevice_notifier - register a network notifier block
1120  *      @nb: notifier
1121  *
1122  *      Register a notifier to be called when network device events occur.
1123  *      The notifier passed is linked into the kernel structures and must
1124  *      not be reused until it has been unregistered. A negative errno code
1125  *      is returned on a failure.
1126  *
1127  *      When registered all registration and up events are replayed
1128  *      to the new notifier to allow device to have a race free
1129  *      view of the network device list.
1130  */
1131 
1132 int register_netdevice_notifier(struct notifier_block *nb)
1133 {
1134         struct net_device *dev;
1135         struct net_device *last;
1136         struct net *net;
1137         int err;
1138 
1139         rtnl_lock();
1140         err = raw_notifier_chain_register(&netdev_chain, nb);
1141         if (err)
1142                 goto unlock;
1143         if (dev_boot_phase)
1144                 goto unlock;
1145         for_each_net(net) {
1146                 for_each_netdev(net, dev) {
1147                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1148                         err = notifier_to_errno(err);
1149                         if (err)
1150                                 goto rollback;
1151 
1152                         if (!(dev->flags & IFF_UP))
1153                                 continue;
1154 
1155                         nb->notifier_call(nb, NETDEV_UP, dev);
1156                 }
1157         }
1158 
1159 unlock:
1160         rtnl_unlock();
1161         return err;
1162 
1163 rollback:
1164         last = dev;
1165         for_each_net(net) {
1166                 for_each_netdev(net, dev) {
1167                         if (dev == last)
1168                                 break;
1169 
1170                         if (dev->flags & IFF_UP) {
1171                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1172                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1173                         }
1174                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1175                 }
1176         }
1177 
1178         raw_notifier_chain_unregister(&netdev_chain, nb);
1179         goto unlock;
1180 }
1181 
1182 /**
1183  *      unregister_netdevice_notifier - unregister a network notifier block
1184  *      @nb: notifier
1185  *
1186  *      Unregister a notifier previously registered by
1187  *      register_netdevice_notifier(). The notifier is unlinked into the
1188  *      kernel structures and may then be reused. A negative errno code
1189  *      is returned on a failure.
1190  */
1191 
1192 int unregister_netdevice_notifier(struct notifier_block *nb)
1193 {
1194         int err;
1195 
1196         rtnl_lock();
1197         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1198         rtnl_unlock();
1199         return err;
1200 }
1201 
1202 /**
1203  *      call_netdevice_notifiers - call all network notifier blocks
1204  *      @val: value passed unmodified to notifier function
1205  *      @dev: net_device pointer passed unmodified to notifier function
1206  *
1207  *      Call all network notifier blocks.  Parameters and return value
1208  *      are as for raw_notifier_call_chain().
1209  */
1210 
1211 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1212 {
1213         return raw_notifier_call_chain(&netdev_chain, val, dev);
1214 }
1215 
1216 /* When > 0 there are consumers of rx skb time stamps */
1217 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1218 
1219 void net_enable_timestamp(void)
1220 {
1221         atomic_inc(&netstamp_needed);
1222 }
1223 
1224 void net_disable_timestamp(void)
1225 {
1226         atomic_dec(&netstamp_needed);
1227 }
1228 
1229 static inline void net_timestamp(struct sk_buff *skb)
1230 {
1231         if (atomic_read(&netstamp_needed))
1232                 __net_timestamp(skb);
1233         else
1234                 skb->tstamp.tv64 = 0;
1235 }
1236 
1237 /*
1238  *      Support routine. Sends outgoing frames to any network
1239  *      taps currently in use.
1240  */
1241 
1242 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1243 {
1244         struct packet_type *ptype;
1245 
1246         net_timestamp(skb);
1247 
1248         rcu_read_lock();
1249         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1250                 /* Never send packets back to the socket
1251                  * they originated from - MvS (miquels@drinkel.ow.org)
1252                  */
1253                 if ((ptype->dev == dev || !ptype->dev) &&
1254                     (ptype->af_packet_priv == NULL ||
1255                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1256                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1257                         if (!skb2)
1258                                 break;
1259 
1260                         /* skb->nh should be correctly
1261                            set by sender, so that the second statement is
1262                            just protection against buggy protocols.
1263                          */
1264                         skb_reset_mac_header(skb2);
1265 
1266                         if (skb_network_header(skb2) < skb2->data ||
1267                             skb2->network_header > skb2->tail) {
1268                                 if (net_ratelimit())
1269                                         printk(KERN_CRIT "protocol %04x is "
1270                                                "buggy, dev %s\n",
1271                                                skb2->protocol, dev->name);
1272                                 skb_reset_network_header(skb2);
1273                         }
1274 
1275                         skb2->transport_header = skb2->network_header;
1276                         skb2->pkt_type = PACKET_OUTGOING;
1277                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1278                 }
1279         }
1280         rcu_read_unlock();
1281 }
1282 
1283 
1284 void __netif_schedule(struct net_device *dev)
1285 {
1286         if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1287                 unsigned long flags;
1288                 struct softnet_data *sd;
1289 
1290                 local_irq_save(flags);
1291                 sd = &__get_cpu_var(softnet_data);
1292                 dev->next_sched = sd->output_queue;
1293                 sd->output_queue = dev;
1294                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1295                 local_irq_restore(flags);
1296         }
1297 }
1298 EXPORT_SYMBOL(__netif_schedule);
1299 
1300 void dev_kfree_skb_irq(struct sk_buff *skb)
1301 {
1302         if (atomic_dec_and_test(&skb->users)) {
1303                 struct softnet_data *sd;
1304                 unsigned long flags;
1305 
1306                 local_irq_save(flags);
1307                 sd = &__get_cpu_var(softnet_data);
1308                 skb->next = sd->completion_queue;
1309                 sd->completion_queue = skb;
1310                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1311                 local_irq_restore(flags);
1312         }
1313 }
1314 EXPORT_SYMBOL(dev_kfree_skb_irq);
1315 
/*
 * Free @skb from any context: use the deferred variant when running in
 * hard interrupt context or with interrupts disabled, the plain one
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1324 
1325 
1326 /**
1327  * netif_device_detach - mark device as removed
1328  * @dev: network device
1329  *
1330  * Mark device as removed from system and therefore no longer available.
1331  */
1332 void netif_device_detach(struct net_device *dev)
1333 {
1334         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1335             netif_running(dev)) {
1336                 netif_stop_queue(dev);
1337         }
1338 }
1339 EXPORT_SYMBOL(netif_device_detach);
1340 
1341 /**
1342  * netif_device_attach - mark device as attached
1343  * @dev: network device
1344  *
1345  * Mark device as attached from system and restart if needed.
1346  */
1347 void netif_device_attach(struct net_device *dev)
1348 {
1349         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1350             netif_running(dev)) {
1351                 netif_wake_queue(dev);
1352                 __netdev_watchdog_up(dev);
1353         }
1354 }
1355 EXPORT_SYMBOL(netif_device_attach);
1356 
1357 
1358 /*
1359  * Invalidate hardware checksum when packet is to be mangled, and
1360  * complete checksum manually on outgoing path.
1361  */
1362 int skb_checksum_help(struct sk_buff *skb)
1363 {
1364         __wsum csum;
1365         int ret = 0, offset;
1366 
1367         if (skb->ip_summed == CHECKSUM_COMPLETE)
1368                 goto out_set_summed;
1369 
1370         if (unlikely(skb_shinfo(skb)->gso_size)) {
1371                 /* Let GSO fix up the checksum. */
1372                 goto out_set_summed;
1373         }
1374 
1375         offset = skb->csum_start - skb_headroom(skb);
1376         BUG_ON(offset >= skb_headlen(skb));
1377         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1378 
1379         offset += skb->csum_offset;
1380         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1381 
1382         if (skb_cloned(skb) &&
1383             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1384                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1385                 if (ret)
1386                         goto out;
1387         }
1388 
1389         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1390 out_set_summed:
1391         skb->ip_summed = CHECKSUM_NONE;
1392 out:
1393         return ret;
1394 }
1395 
1396 /**
1397  *      skb_gso_segment - Perform segmentation on skb.
1398  *      @skb: buffer to segment
1399  *      @features: features for the output path (see dev->features)
1400  *
1401  *      This function segments the given skb and returns a list of segments.
1402  *
1403  *      It may return NULL if the skb requires no segmentation.  This is
1404  *      only possible when GSO is used for verifying header integrity.
1405  */
1406 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1407 {
1408         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1409         struct packet_type *ptype;
1410         __be16 type = skb->protocol;
1411         int err;
1412 
1413         BUG_ON(skb_shinfo(skb)->frag_list);
1414 
1415         skb_reset_mac_header(skb);
1416         skb->mac_len = skb->network_header - skb->mac_header;
1417         __skb_pull(skb, skb->mac_len);
1418 
1419         if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1420                 if (skb_header_cloned(skb) &&
1421                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1422                         return ERR_PTR(err);
1423         }
1424 
1425         rcu_read_lock();
1426         list_for_each_entry_rcu(ptype,
1427                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1428                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1429                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1430                                 err = ptype->gso_send_check(skb);
1431                                 segs = ERR_PTR(err);
1432                                 if (err || skb_gso_ok(skb, features))
1433                                         break;
1434                                 __skb_push(skb, (skb->data -
1435                                                  skb_network_header(skb)));
1436                         }
1437                         segs = ptype->gso_segment(skb, features);
1438                         break;
1439                 }
1440         }
1441         rcu_read_unlock();
1442 
1443         __skb_push(skb, skb->data - skb_mac_header(skb));
1444 
1445         return segs;
1446 }
1447 
1448 EXPORT_SYMBOL(skb_gso_segment);
1449 
/*
 * Take action when hardware reception checksum errors are detected:
 * print a rate-limited error naming the device (if known) and dump the
 * stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1462 
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise (always 0 without
 * CONFIG_HIGHMEM).
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int frag = 0;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		while (frag < skb_shinfo(skb)->nr_frags) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
			frag++;
		}
	}

#endif
	return 0;
}
1483 
/*
 * Kept in skb->cb of a software-segmented skb: the destructor the skb
 * had before dev_gso_segment() installed dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1489 
1490 static void dev_gso_skb_destructor(struct sk_buff *skb)
1491 {
1492         struct dev_gso_cb *cb;
1493 
1494         do {
1495                 struct sk_buff *nskb = skb->next;
1496 
1497                 skb->next = nskb->next;
1498                 nskb->next = NULL;
1499                 kfree_skb(nskb);
1500         } while (skb->next);
1501 
1502         cb = DEV_GSO_CB(skb);
1503         if (cb->destructor)
1504                 cb->destructor(skb);
1505 }
1506 
1507 /**
1508  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1509  *      @skb: buffer to segment
1510  *
1511  *      This function segments the given skb and stores the list of segments
1512  *      in skb->next.
1513  */
1514 static int dev_gso_segment(struct sk_buff *skb)
1515 {
1516         struct net_device *dev = skb->dev;
1517         struct sk_buff *segs;
1518         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1519                                          NETIF_F_SG : 0);
1520 
1521         segs = skb_gso_segment(skb, features);
1522 
1523         /* Verifying header integrity only. */
1524         if (!segs)
1525                 return 0;
1526 
1527         if (unlikely(IS_ERR(segs)))
1528                 return PTR_ERR(segs);
1529 
1530         skb->next = segs;
1531         DEV_GSO_CB(skb)->destructor = skb->destructor;
1532         skb->destructor = dev_gso_skb_destructor;
1533 
1534         return 0;
1535 }
1536 
1537 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1538 {
1539         if (likely(!skb->next)) {
1540                 if (!list_empty(&ptype_all))
1541                         dev_queue_xmit_nit(skb, dev);
1542 
1543                 if (netif_needs_gso(dev, skb)) {
1544                         if (unlikely(dev_gso_segment(skb)))
1545                                 goto out_kfree_skb;
1546                         if (skb->next)
1547                                 goto gso;
1548                 }
1549 
1550                 return dev->hard_start_xmit(skb, dev);
1551         }
1552 
1553 gso:
1554         do {
1555                 struct sk_buff *nskb = skb->next;
1556                 int rc;
1557 
1558                 skb->next = nskb->next;
1559                 nskb->next = NULL;
1560                 rc = dev->hard_start_xmit(nskb, dev);
1561                 if (unlikely(rc)) {
1562                         nskb->next = skb->next;
1563                         skb->next = nskb;
1564                         return rc;
1565                 }
1566                 if (unlikely((netif_queue_stopped(dev) ||
1567                              netif_subqueue_stopped(dev, skb)) &&
1568                              skb->next))
1569                         return NETDEV_TX_BUSY;
1570         } while (skb->next);
1571 
1572         skb->destructor = DEV_GSO_CB(skb)->destructor;
1573 
1574 out_kfree_skb:
1575         kfree_skb(skb);
1576         return 0;
1577 }
1578 
1579 /**
1580  *      dev_queue_xmit - transmit a buffer
1581  *      @skb: buffer to transmit
1582  *
1583  *      Queue a buffer for transmission to a network device. The caller must
1584  *      have set the device and priority and built the buffer before calling
1585  *      this function. The function can be called from an interrupt.
1586  *
1587  *      A negative errno code is returned on a failure. A success does not
1588  *      guarantee the frame will be transmitted as it may be dropped due
1589  *      to congestion or traffic shaping.
1590  *
1591  * -----------------------------------------------------------------------------------
1592  *      I notice this method can also return errors from the queue disciplines,
1593  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1594  *      be positive.
1595  *
1596  *      Regardless of the return value, the skb is consumed, so it is currently
1597  *      difficult to retry a send to this method.  (You can bump the ref count
1598  *      before sending to hold a reference for retry if you are careful.)
1599  *
1600  *      When calling this method, interrupts MUST be enabled.  This is because
1601  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1602  *          --BLG
1603  */
1604 
1605 int dev_queue_xmit(struct sk_buff *skb)
1606 {
1607         struct net_device *dev = skb->dev;
1608         struct Qdisc *q;
1609         int rc = -ENOMEM;
1610 
1611         /* GSO will handle the following emulations directly. */
1612         if (netif_needs_gso(dev, skb))
1613                 goto gso;
1614 
1615         if (skb_shinfo(skb)->frag_list &&
1616             !(dev->features & NETIF_F_FRAGLIST) &&
1617             __skb_linearize(skb))
1618                 goto out_kfree_skb;
1619 
1620         /* Fragmented skb is linearized if device does not support SG,
1621          * or if at least one of fragments is in highmem and device
1622          * does not support DMA from it.
1623          */
1624         if (skb_shinfo(skb)->nr_frags &&
1625             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1626             __skb_linearize(skb))
1627                 goto out_kfree_skb;
1628 
1629         /* If packet is not checksummed and device does not support
1630          * checksumming for this protocol, complete checksumming here.
1631          */
1632         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1633                 skb_set_transport_header(skb, skb->csum_start -
1634                                               skb_headroom(skb));
1635 
1636                 if (!(dev->features & NETIF_F_GEN_CSUM) &&
1637                     !((dev->features & NETIF_F_IP_CSUM) &&
1638                       skb->protocol == htons(ETH_P_IP)) &&
1639                     !((dev->features & NETIF_F_IPV6_CSUM) &&
1640                       skb->protocol == htons(ETH_P_IPV6)))
1641                         if (skb_checksum_help(skb))
1642                                 goto out_kfree_skb;
1643         }
1644 
1645 gso:
1646         spin_lock_prefetch(&dev->queue_lock);
1647 
1648         /* Disable soft irqs for various locks below. Also
1649          * stops preemption for RCU.
1650          */
1651         rcu_read_lock_bh();
1652 
1653         /* Updates of qdisc are serialized by queue_lock.
1654          * The struct Qdisc which is pointed to by qdisc is now a
1655          * rcu structure - it may be accessed without acquiring
1656          * a lock (but the structure may be stale.) The freeing of the
1657          * qdisc will be deferred until it's known that there are no
1658          * more references to it.
1659          *
1660          * If the qdisc has an enqueue function, we still need to
1661          * hold the queue_lock before calling it, since queue_lock
1662          * also serializes access to the device queue.
1663          */
1664 
1665         q = rcu_dereference(dev->qdisc);
1666 #ifdef CONFIG_NET_CLS_ACT
1667         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1668 #endif
1669         if (q->enqueue) {
1670                 /* Grab device queue */
1671                 spin_lock(&dev->queue_lock);
1672                 q = dev->qdisc;
1673                 if (q->enqueue) {
1674                         /* reset queue_mapping to zero */
1675                         skb_set_queue_mapping(skb, 0);
1676                         rc = q->enqueue(skb, q);
1677                         qdisc_run(dev);
1678                         spin_unlock(&dev->queue_lock);
1679 
1680                         rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1681                         goto out;
1682                 }
1683                 spin_unlock(&dev->queue_lock);
1684         }
1685 
1686         /* The device has no queue. Common case for software devices:
1687            loopback, all the sorts of tunnels...
1688 
1689            Really, it is unlikely that netif_tx_lock protection is necessary
1690            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1691            counters.)
1692            However, it is possible, that they rely on protection
1693            made by us here.
1694 
1695            Check this and shot the lock. It is not prone from deadlocks.
1696            Either shot noqueue qdisc, it is even simpler 8)
1697          */
1698         if (dev->flags & IFF_UP) {
1699 
1700                 if (dev->xmit_lock_owner != (void *)current) {
1701 
1702                         HARD_TX_LOCK(dev);
1703 
1704                         if (!netif_queue_stopped(dev) &&
1705                             !netif_subqueue_stopped(dev, skb)) {
1706                                 rc = 0;
1707                                 if (!dev_hard_start_xmit(skb, dev)) {
1708                                         HARD_TX_UNLOCK(dev);
1709                                         goto out;
1710                                 }
1711                         }
1712                         HARD_TX_UNLOCK(dev);
1713                         if (net_ratelimit())
1714                                 printk(KERN_CRIT "Virtual device %s asks to "
1715                                        "queue packet!\n", dev->name);
1716                 } else {
1717                         /* Recursion is detected! It is possible,
1718                          * unfortunately */
1719                         if (net_ratelimit())
1720                                 printk(KERN_CRIT "Dead loop on virtual device "
1721                                        "%s, fix it urgently!\n", dev->name);
1722                 }
1723         }
1724 
1725         rc = -ENETDOWN;
1726         rcu_read_unlock_bh();
1727 
1728 out_kfree_skb:
1729         kfree_skb(skb);
1730         return rc;
1731 out:
1732         rcu_read_unlock_bh();
1733         return rc;
1734 }
1735 
1736 
1737 /*=======================================================================
1738                         Receiver routines
1739   =======================================================================*/
1740 
1741 int netdev_max_backlog __read_mostly = 1000;    /* netif_rx() drops once input_pkt_queue.qlen exceeds this */
1742 int netdev_budget __read_mostly = 300;  /* starting budget of each net_rx_action() run */
1743 int weight_p __read_mostly = 64;            /* old backlog weight; process_backlog() copies it into napi->weight */
1744 
     /* Per-CPU receive counters; total, dropped, time_squeeze and cpu_collision
      * are the fields printed by softnet_seq_show(). */
1745 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1746 
1747 
1748 /**
1749  *      netif_rx        -       post buffer to the network code
1750  *      @skb: buffer to post
1751  *
1752  *      This function receives a packet from a device driver and queues it for
1753  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1754  *      may be dropped during processing for congestion control or by the
1755  *      protocol layers.
1756  *
      *      The packet is appended to the current CPU's input_pkt_queue with
      *      local interrupts disabled.  A reference is taken on skb->dev for
      *      every queued packet; process_backlog() drops it after delivery.
      *      When the queue already holds more than netdev_max_backlog packets
      *      the buffer is freed and netdev_rx_stat.dropped is incremented.
      *      NET_RX_DROP is also returned, without freeing here, when
      *      netpoll_rx() claims the packet.
      *
1757  *      return values:
1758  *      NET_RX_SUCCESS  (no congestion)
1759  *      NET_RX_DROP     (packet was dropped)
1760  *
1761  */
1762 
1763 int netif_rx(struct sk_buff *skb)
1764 {
1765         struct softnet_data *queue;
1766         unsigned long flags;
1767 
1768         /* if netpoll wants it, pretend we never saw it */
1769         if (netpoll_rx(skb))
1770                 return NET_RX_DROP;
1771 
             /* stamp the packet only if nobody has set a timestamp yet */
1772         if (!skb->tstamp.tv64)
1773                 net_timestamp(skb);
1774 
1775         /*
1776          * The code is rearranged so that the path is the most
1777          * short when CPU is congested, but is still operating.
1778          */
1779         local_irq_save(flags);
1780         queue = &__get_cpu_var(softnet_data);
1781 
1782         __get_cpu_var(netdev_rx_stat).total++;
1783         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
                     /* non-empty queue: just append the packet */
1784                 if (queue->input_pkt_queue.qlen) {
1785 enqueue:
                             /* paired with dev_put() in process_backlog() */
1786                         dev_hold(skb->dev);
1787                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1788                         local_irq_restore(flags);
1789                         return NET_RX_SUCCESS;
1790                 }
1791 
                     /* empty queue: schedule the backlog NAPI instance first,
                      * then jump back into the branch above to enqueue */
1792                 napi_schedule(&queue->backlog);
1793                 goto enqueue;
1794         }
1795 
             /* backlog limit exceeded: account the drop and free the buffer */
1796         __get_cpu_var(netdev_rx_stat).dropped++;
1797         local_irq_restore(flags);
1798 
1799         kfree_skb(skb);
1800         return NET_RX_DROP;
1801 }
1802 
/*
 * netif_rx_ni - queue a packet through netif_rx(), then run any softirqs
 * pending on this CPU.  The pending check and do_softirq() call are made
 * with preemption disabled.  Returns the value netif_rx() returned.
 */
1803 int netif_rx_ni(struct sk_buff *skb)
1804 {
1805         int err;
1806 
1807         err = netif_rx(skb);
1808         preempt_disable();
             /* netif_rx() may have scheduled the backlog; process it now */
1809         if (local_softirq_pending())
1810                 do_softirq();
1811         preempt_enable();
1812 
1813         return err;
1814 }
1815 
1816 EXPORT_SYMBOL(netif_rx_ni);
1817 
1818 static inline struct net_device *skb_bond(struct sk_buff *skb)
1819 {
1820         struct net_device *dev = skb->dev;
1821 
1822         if (dev->master) {
1823                 if (skb_bond_should_drop(skb)) {
1824                         kfree_skb(skb);
1825                         return NULL;
1826                 }
1827                 skb->dev = dev->master;
1828         }
1829 
1830         return dev;
1831 }
1832 
1833 
/*
 * net_tx_action - transmit-side softirq work for the current CPU.
 * (Presumably registered as the NET_TX_SOFTIRQ handler; the registration
 * is not visible in this part of the file.)
 *
 * Two per-CPU lists from softnet_data are drained:
 *  - completion_queue: skbs waiting to be freed with __kfree_skb();
 *  - output_queue: devices whose qdisc should be run with qdisc_run().
 * Each list is detached with interrupts disabled and then walked privately.
 * The argument @h is unused.
 */
1834 static void net_tx_action(struct softirq_action *h)
1835 {
1836         struct softnet_data *sd = &per_cpu(softnet_data,
1837                                            raw_smp_processor_id());
1838 
1839         if (sd->completion_queue) {
1840                 struct sk_buff *clist;
1841 
                     /* steal the whole list atomically w.r.t. local interrupts */
1842                 local_irq_disable();
1843                 clist = sd->completion_queue;
1844                 sd->completion_queue = NULL;
1845                 local_irq_enable();
1846 
1847                 while (clist) {
1848                         struct sk_buff *skb = clist;
1849                         clist = clist->next;
1850 
                             /* entries here are expected to have no users left */
1851                         BUG_TRAP(!atomic_read(&skb->users));
1852                         __kfree_skb(skb);
1853                         /*
1854                          * Safe to reschedule - the list is private
1855                          * at this point.
1856                          */
1857                         cond_resched_softirq_context();
1858                 }
1859         }
1860 
1861         if (sd->output_queue) {
1862                 struct net_device *head;
1863 
                     /* same detach-then-walk pattern for the device list */
1864                 local_irq_disable();
1865                 head = sd->output_queue;
1866                 sd->output_queue = NULL;
1867                 local_irq_enable();
1868 
1869                 while (head) {
1870                         struct net_device *dev = head;
1871                         head = head->next_sched;
1872 
                             /* the device is off the list: clear its SCHED bit */
1873                         smp_mb__before_clear_bit();
1874                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1875 
1876                         /*
1877                          * We are executing in softirq context here, and
1878                          * if softirqs are preemptible, we must avoid
1879                          * infinite reactivation of the softirq by
1880                          * either the tx handler, or by netif_schedule().
1881                          * (it would result in an infinitely looping
1882                          *  softirq context)
1883                          * So we take the spinlock unconditionally.
1884                          */
1885 #ifdef CONFIG_PREEMPT_SOFTIRQS
1886                         spin_lock(&dev->queue_lock);
1887                         qdisc_run(dev);
1888                         spin_unlock(&dev->queue_lock);
1889 #else
                             /* lock contended: requeue the device instead of spinning */
1890                         if (spin_trylock(&dev->queue_lock)) {
1891                                 qdisc_run(dev);
1892                                 spin_unlock(&dev->queue_lock);
1893                         } else {
1894                                 netif_schedule(dev);
1895                         }
1896 #endif
1897                 }
1898         }
1899 }
1900 
/*
 * deliver_skb - hand @skb to the handler of @pt_prev.
 * An extra reference is taken on the skb (skb->users) before the call, so
 * the caller keeps its own reference and can go on using the buffer.
 * Returns the handler's return value.
 */
1901 static inline int deliver_skb(struct sk_buff *skb,
1902                               struct packet_type *pt_prev,
1903                               struct net_device *orig_dev)
1904 {
1905         atomic_inc(&skb->users);
1906         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1907 }
1908 
1909 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
1910 /* These hooks defined here for ATM */
1911 struct net_bridge;
1912 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
1913                                                 unsigned char *addr);
1914 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
1915 
1916 /*
1917  * If bridge module is loaded call bridging hook.
1918  *  returns NULL if packet was consumed.
      *
      * Packets of type PACKET_LOOPBACK and packets whose device has no
      * br_port are returned unchanged.  Otherwise any handler still pending
      * in *pt_prev is run first (result stored in *ret, *pt_prev reset to
      * NULL) and the packet is passed to br_handle_frame_hook().
1919  */
1920 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
1921                                         struct sk_buff *skb) __read_mostly;
1922 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
1923                                             struct packet_type **pt_prev, int *ret,
1924                                             struct net_device *orig_dev)
1925 {
1926         struct net_bridge_port *port;
1927 
1928         if (skb->pkt_type == PACKET_LOOPBACK ||
1929             (port = rcu_dereference(skb->dev->br_port)) == NULL)
1930                 return skb;
1931 
             /* flush the deferred delivery before the bridge sees the packet */
1932         if (*pt_prev) {
1933                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
1934                 *pt_prev = NULL;
1935         }
1936 
1937         return br_handle_frame_hook(port, skb);
1938 }
1939 #else
     /* bridging not configured: the packet passes through untouched */
1940 #define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
1941 #endif
1942 
1943 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
     /* Hook called by handle_macvlan() for devices that have a macvlan_port. */
1944 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
1945 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
1946 
     /*
      * handle_macvlan - give the macvlan hook a chance to take the packet.
      * Packets whose device has no macvlan_port are returned unchanged.
      * Otherwise any handler pending in *pt_prev is run first (result in
      * *ret, *pt_prev reset to NULL) and the hook's return value is passed
      * back; netif_receive_skb() stops processing when that value is NULL.
      */
1947 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
1948                                              struct packet_type **pt_prev,
1949                                              int *ret,
1950                                              struct net_device *orig_dev)
1951 {
1952         if (skb->dev->macvlan_port == NULL)
1953                 return skb;
1954 
1955         if (*pt_prev) {
1956                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
1957                 *pt_prev = NULL;
1958         }
1959         return macvlan_handle_frame_hook(skb);
1960 }
1961 #else
     /* macvlan not configured: the packet passes through untouched */
1962 #define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
1963 #endif
1964 
1965 #ifdef CONFIG_NET_CLS_ACT
1966 /* TODO: Maybe we should just force sch_ingress to be compiled in
1967  * when CONFIG_NET_CLS_ACT is? Otherwise there are some useless
1968  * instructions (a compare and 2 extra stores) right now if we don't have
1969  * it on but have CONFIG_NET_CLS_ACT.
1970  * NOTE: This doesn't stop any functionality; if you don't have
1971  * the ingress scheduler, you just can't add policies on ingress.
1972  *
      * ing_filter() runs @skb through the ingress qdisc of skb->dev, if one
      * is attached, under dev->ingress_lock.  It returns the qdisc's enqueue
      * result, TC_ACT_OK when no ingress qdisc is attached, or TC_ACT_SHOT
      * when the redirect counter carried in skb->tc_verd exceeds
      * MAX_RED_LOOP.
1973  */
1974 static int ing_filter(struct sk_buff *skb)
1975 {
1976         struct Qdisc *q;
1977         struct net_device *dev = skb->dev;
1978         int result = TC_ACT_OK;
1979         u32 ttl = G_TC_RTTL(skb->tc_verd);
1980 
             /* compare the old counter value, then bump it for the store below */
1981         if (MAX_RED_LOOP < ttl++) {
1982                 printk(KERN_WARNING
1983                        "Redir loop detected Dropping packet (%d->%d)\n",
1984                        skb->iif, dev->ifindex);
1985                 return TC_ACT_SHOT;
1986         }
1987 
             /* record the incremented counter and mark the packet as ingress */
1988         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
1989         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1990 
1991         spin_lock(&dev->ingress_lock);
1992         if ((q = dev->qdisc_ingress) != NULL)
1993                 result = q->enqueue(skb, q);
1994         spin_unlock(&dev->ingress_lock);
1995 
1996         return result;
1997 }
1998 
/*
 * handle_ing - apply ingress classification to a received packet.
 * Does nothing but clear skb->tc_verd when the device has no ingress
 * qdisc.  Otherwise a handler pending in *pt_prev is run first (result in
 * *ret, *pt_prev reset to NULL), then ing_filter() is consulted: on
 * TC_ACT_SHOT or TC_ACT_STOLEN the skb is freed and NULL is returned.
 * In all other cases skb->tc_verd is cleared and the skb is returned.
 */
1999 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2000                                          struct packet_type **pt_prev,
2001                                          int *ret, struct net_device *orig_dev)
2002 {
2003         if (!skb->dev->qdisc_ingress)
2004                 goto out;
2005 
2006         if (*pt_prev) {
2007                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2008                 *pt_prev = NULL;
2009         } else {
2010                 /* Huh? Why does turning on AF_PACKET affect this? */
2011                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2012         }
2013 
             /* any verdict other than SHOT/STOLEN lets the packet continue */
2014         switch (ing_filter(skb)) {
2015         case TC_ACT_SHOT:
2016         case TC_ACT_STOLEN:
2017                 kfree_skb(skb);
2018                 return NULL;
2019         }
2020 
2021 out:
2022         skb->tc_verd = 0;
2023         return skb;
2024 }
2025 #endif
2026 
2027 /**
2028  *      netif_receive_skb - process receive buffer from network
2029  *      @skb: buffer to process
2030  *
2031  *      netif_receive_skb() is the main receive data processing function.
2032  *      It always succeeds. The buffer may be dropped during processing
2033  *      for congestion control or by the protocol layers.
2034  *
2035  *      This function may only be called from softirq context and interrupts
2036  *      should be enabled.
2037  *
      *      Delivery is deferred by one handler: each matching packet_type is
      *      remembered in pt_prev and only invoked (through deliver_skb(),
      *      which takes an extra skb reference) once the next match is found.
      *      The last match is called directly, without the extra reference.
      *      If nothing matches, the skb is freed here.
      *
2038  *      Return values (usually ignored):
2039  *      NET_RX_SUCCESS: no congestion
2040  *      NET_RX_DROP: packet was dropped
2041  */
2042 int netif_receive_skb(struct sk_buff *skb)
2043 {
2044         struct packet_type *ptype, *pt_prev;
2045         struct net_device *orig_dev;
2046         int ret = NET_RX_DROP;
2047         __be16 type;
2048 
2049         /* if we've gotten here through NAPI, check netpoll */
2050         if (netpoll_receive_skb(skb))
2051                 return NET_RX_DROP;
2052 
2053         if (!skb->tstamp.tv64)
2054                 net_timestamp(skb);
2055 
             /* remember the receiving interface index unless already set */
2056         if (!skb->iif)
2057                 skb->iif = skb->dev->ifindex;
2058 
             /* may retarget skb->dev at a master device; NULL means the
              * skb was dropped and freed by skb_bond() */
2059         orig_dev = skb_bond(skb);
2060 
2061         if (!orig_dev)
2062                 return NET_RX_DROP;
2063 
2064         per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++;
2065 
2066         skb_reset_network_header(skb);
2067         skb_reset_transport_header(skb);
2068         skb->mac_len = skb->network_header - skb->mac_header;
2069 
2070         pt_prev = NULL;
2071 
2072         rcu_read_lock();
2073 
2074 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip both the ptype_all handlers
              * and ingress classification */
2075         if (skb->tc_verd & TC_NCLS) {
2076                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2077                 goto ncls;
2078         }
2079 #endif
2080 
             /* handlers registered for all protocols (ptype_all list) */
2081         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2082                 if (!ptype->dev || ptype->dev == skb->dev) {
2083                         if (pt_prev)
2084                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2085                         pt_prev = ptype;
2086                 }
2087         }
2088 
2089 #ifdef CONFIG_NET_CLS_ACT
2090         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2091         if (!skb)
2092                 goto out;
2093 ncls:
2094 #endif
2095 
             /* each hook returns NULL when it has consumed the packet */
2096         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2097         if (!skb)
2098                 goto out;
2099         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2100         if (!skb)
2101                 goto out;
2102 
             /* protocol-specific handlers, hashed by skb->protocol */
2103         type = skb->protocol;
2104         list_for_each_entry_rcu(ptype,
2105                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2106                 if (ptype->type == type &&
2107                     (!ptype->dev || ptype->dev == skb->dev)) {
2108                         if (pt_prev)
2109                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2110                         pt_prev = ptype;
2111                 }
2112         }
2113 
             /* the final handler is called without an extra skb reference */
2114         if (pt_prev) {
2115                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2116         } else {
2117                 kfree_skb(skb);
2118                 /* Jamal, now you will not be able to escape explaining
2119                  * me how you were going to use this. :-)
2120                  */
2121                 ret = NET_RX_DROP;
2122         }
2123 
2124 out:
2125         rcu_read_unlock();
2126         return ret;
2127 }
2128 
/*
 * process_backlog - NAPI poll routine for the per-CPU input_pkt_queue
 * filled by netif_rx().
 * @napi:  backlog NAPI instance (its weight is refreshed from weight_p)
 * @quota: maximum number of packets to process in this call
 *
 * Packets are dequeued one at a time with interrupts disabled and passed
 * to netif_receive_skb() with interrupts enabled.  The loop ends when the
 * queue is empty (the NAPI instance is then completed), when @quota
 * packets have been handled, or when jiffies has advanced since entry.
 * Returns the number of packets processed.
 */
2129 static int process_backlog(struct napi_struct *napi, int quota)
2130 {
2131         int work = 0;
2132         struct softnet_data *queue;
2133         unsigned long start_time = jiffies;
2134 
2135         queue = &per_cpu(softnet_data, raw_smp_processor_id());
2136         napi->weight = weight_p;
2137         do {
2138                 struct sk_buff *skb;
2139                 struct net_device *dev;
2140 
2141                 local_irq_disable();
2142                 skb = __skb_dequeue(&queue->input_pkt_queue);
2143                 if (!skb) {
                             /* queue drained: complete while IRQs are still off */
2144                         __napi_complete(napi);
2145                         local_irq_enable();
2146                         break;
2147                 }
2148 
2149                 local_irq_enable();
2150 
                     /* save the device now; skb is handed off by the call below */
2151                 dev = skb->dev;
2152 
2153                 netif_receive_skb(skb);
2154 
                     /* drops the reference taken by dev_hold() in netif_rx() */
2155                 dev_put(dev);
2156         } while (++work < quota && jiffies == start_time);
2157 
2158         return work;
2159 }
2160 
2161 /**
2162  * __napi_schedule - schedule for receive
2163  * @n: entry to schedule
2164  *
2165  * The entry's receive function will be scheduled to run
      *
      * The entry is appended to the tail of the current CPU's poll_list and
      * NET_RX_SOFTIRQ is raised, both with local interrupts disabled.
2166  */
2167 void __napi_schedule(struct napi_struct *n)
2168 {
2169         unsigned long flags;
2170 
2171         local_irq_save(flags);
2172         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2173         raise_softirq_irqoff(NET_RX_SOFTIRQ);
2174         local_irq_restore(flags);
2175 }
2176 EXPORT_SYMBOL(__napi_schedule);
2177 
2178 
/*
 * net_rx_action - poll the NAPI instances queued on this CPU's poll_list.
 * (It re-raises NET_RX_SOFTIRQ on itself, so it is presumably that
 * softirq's handler; the registration is not visible here.)
 *
 * Instances are polled from the head of the list until the list is empty,
 * the budget (initially netdev_budget) is used up, or jiffies advances.
 * In the last two cases time_squeeze is incremented and the softirq is
 * raised again so the remaining work is picked up later.
 * The argument @h is unused.
 */
2179 static void net_rx_action(struct softirq_action *h)
2180 {
2181         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2182         unsigned long start_time = jiffies;
2183         int budget = netdev_budget;
2184         void *have;
2185 
2186         local_irq_disable();
2187 
2188         while (!list_empty(list)) {
2189                 struct napi_struct *n;
2190                 int work, weight;
2191 
2192                 /* If softirq window is exhausted then punt.
2193                  *
2194                  * Note that this is a slight policy change from the
2195                  * previous NAPI code, which would allow up to 2
2196                  * jiffies to pass before breaking out.  The test
2197                  * used to be "jiffies - start_time > 1".
2198                  */
2199                 if (unlikely(budget <= 0 || jiffies != start_time))
2200                         goto softnet_break;
2201 
2202                 local_irq_enable();
2203 
2204                 /* Even though interrupts have been re-enabled, this
2205                  * access is safe because interrupts can only add new
2206                  * entries to the tail of this list, and only ->poll()
2207                  * calls can remove this head entry from the list.
2208                  */
2209                 n = list_entry(list->next, struct napi_struct, poll_list);
2210 
2211                 have = netpoll_poll_lock(n);
2212 
2213                 weight = n->weight;
2214 
2215                 /* This NAPI_STATE_SCHED test is for avoiding a race
2216                  * with netpoll's poll_napi().  Only the entity which
2217                  * obtains the lock and sees NAPI_STATE_SCHED set will
2218                  * actually make the ->poll() call.  Therefore we avoid
2219                  * accidentally calling ->poll() when NAPI is not scheduled.
2220                  */
2221                 work = 0;
2222                 if (test_bit(NAPI_STATE_SCHED, &n->state))
2223                         work = n->poll(n, weight);
2224 
                     /* a poll routine must never report more work than its weight */
2225                 WARN_ON_ONCE(work > weight);
2226 
2227                 budget -= work;
2228 
2229                 local_irq_disable();
2230 
2231                 /* Drivers must not modify the NAPI state if they
2232                  * consume the entire weight.  In such cases this code
2233                  * still "owns" the NAPI instance and therefore can
2234                  * move the instance around on the list at-will.
2235                  */
2236                 if (unlikely(work == weight)) {
2237                         if (unlikely(napi_disable_pending(n)))
2238                                 __napi_complete(n);
2239                         else
2240                                 list_move_tail(&n->poll_list, list);
2241                 }
2242 
2243                 netpoll_poll_unlock(have);
2244         }
2245 out:
2246         local_irq_enable();
2247 
2248 #ifdef CONFIG_NET_DMA
2249         /*
2250          * There may not be any more sk_buffs coming right now, so push
2251          * any pending DMA copies to hardware
2252          */
2253         if (!cpus_empty(net_dma.channel_mask)) {
2254                 int chan_idx;
2255                 for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
2256                         struct dma_chan *chan = net_dma.channels[chan_idx];
2257                         if (chan)
2258                                 dma_async_memcpy_issue_pending(chan);
2259                 }
2260         }
2261 #endif
2262 
2263         return;
2264 
     /* reached with interrupts still disabled (jumped to before local_irq_enable) */
2265 softnet_break:
2266         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2267         raise_softirq_irqoff(NET_RX_SOFTIRQ);
2268         goto out;
2269 }
2270 
2271 static gifconf_func_t * gifconf_list [NPROTO];  /* per-family handlers set by register_gifconf(), called from dev_ifconf() */
2272 
2273 /**
2274  *      register_gifconf        -       register a SIOCGIF handler
2275  *      @family: Address family
2276  *      @gifconf: Function handler
2277  *
2278  *      Register protocol dependent address dumping routines. The handler
2279  *      that is passed must not be freed or reused until it has been replaced
2280  *      by another handler.
2281  */
2282 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2283 {
2284         if (family >= NPROTO)
2285                 return -EINVAL;
2286         gifconf_list[family] = gifconf;
2287         return 0;
2288 }
2289 
2290 
2291 /*
2292  *      Map an interface index to its name (SIOCGIFNAME)
2293  */
2294 
2295 /*
2296  *      We need this ioctl for efficient implementation of the
2297  *      if_indextoname() function required by the IPv6 API.  Without
2298  *      it, we would have to search all the interfaces to find a
2299  *      match.  --pb
2300  */
2301 
2302 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2303 {
2304         struct net_device *dev;
2305         struct ifreq ifr;
2306 
2307         /*
2308          *      Fetch the caller's info block.
2309          */
2310 
2311         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2312                 return -EFAULT;
2313 
2314         read_lock(&dev_base_lock);
2315         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2316         if (!dev) {
2317                 read_unlock(&dev_base_lock);
2318                 return -ENODEV;
2319         }
2320 
2321         strcpy(ifr.ifr_name, dev->name);
2322         read_unlock(&dev_base_lock);
2323 
2324         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2325                 return -EFAULT;
2326         return 0;
2327 }
2328 
2329 /*
2330  *      Perform a SIOCGIFCONF call. This structure will change
2331  *      size eventually, and there is nothing I can do about it.
2332  *      Thus we will need a 'compatibility mode'.
2333  */
2334 
2335 static int dev_ifconf(struct net *net, char __user *arg)
2336 {
2337         struct ifconf ifc;
2338         struct net_device *dev;
2339         char __user *pos;
2340         int len;
2341         int total;
2342         int i;
2343 
2344         /*
2345          *      Fetch the caller's info block.
2346          */
2347 
2348         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2349                 return -EFAULT;
2350 
2351         pos = ifc.ifc_buf;
2352         len = ifc.ifc_len;
2353 
2354         /*
2355          *      Loop over the interfaces, and write an info block for each.
2356          */
2357 
2358         total = 0;
2359         for_each_netdev(net, dev) {
2360                 for (i = 0; i < NPROTO; i++) {
2361                         if (gifconf_list[i]) {
2362                                 int done;
2363                                 if (!pos)
2364                                         done = gifconf_list[i](dev, NULL, 0);
2365                                 else
2366                                         done = gifconf_list[i](dev, pos + total,
2367                                                                len - total);
2368                                 if (done < 0)
2369                                         return -EFAULT;
2370                                 total += done;
2371                         }
2372                 }
2373         }
2374 
2375         /*
2376          *      All done.  Write the updated control block back to the caller.
2377          */
2378         ifc.ifc_len = total;
2379 
2380         /*
2381          *      Both BSD and Solaris return 0 here, so we do too.
2382          */
2383         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2384 }
2385 
2386 #ifdef CONFIG_PROC_FS
2387 /*
2388  *      This is invoked by the /proc filesystem handler to display a device
2389  *      in detail.
2390  */
2391 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2392         __acquires(dev_base_lock)
2393 {
2394         struct net *net = seq_file_net(seq);
2395         loff_t off;
2396         struct net_device *dev;
2397 
2398         read_lock(&dev_base_lock);
2399         if (!*pos)
2400                 return SEQ_START_TOKEN;
2401 
2402         off = 1;
2403         for_each_netdev(net, dev)
2404                 if (off++ == *pos)
2405                         return dev;
2406 
2407         return NULL;
2408 }
2409 
2410 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2411 {
2412         struct net *net = seq_file_net(seq);
2413         ++*pos;
2414         return v == SEQ_START_TOKEN ?
2415                 first_net_device(net) : next_net_device((struct net_device *)v);
2416 }
2417 
/* seq_file stop callback: drop the read lock taken by dev_seq_start(). */
2418 void dev_seq_stop(struct seq_file *seq, void *v)
2419         __releases(dev_base_lock)
2420 {
2421         read_unlock(&dev_base_lock);
2422 }
2423 
/*
 * dev_seq_printf_stats - print one statistics row for @dev.
 * The counters come from dev->get_stats().  Eight receive columns are
 * followed by eight transmit columns; some are sums of several counters:
 *   rx column 4: rx_dropped + rx_missed_errors
 *   rx column 6: rx_length_errors + rx_over_errors + rx_crc_errors +
 *                rx_frame_errors
 *   tx column 7: tx_carrier_errors + tx_aborted_errors +
 *                tx_window_errors + tx_heartbeat_errors
 */
2424 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2425 {
2426         struct net_device_stats *stats = dev->get_stats(dev);
2427 
2428         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2429                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2430                    dev->name, stats->rx_bytes, stats->rx_packets,
2431                    stats->rx_errors,
2432                    stats->rx_dropped + stats->rx_missed_errors,
2433                    stats->rx_fifo_errors,
2434                    stats->rx_length_errors + stats->rx_over_errors +
2435                     stats->rx_crc_errors + stats->rx_frame_errors,
2436                    stats->rx_compressed, stats->multicast,
2437                    stats->tx_bytes, stats->tx_packets,
2438                    stats->tx_errors, stats->tx_dropped,
2439                    stats->tx_fifo_errors, stats->collisions,
2440                    stats->tx_carrier_errors +
2441                     stats->tx_aborted_errors +
2442                     stats->tx_window_errors +
2443                     stats->tx_heartbeat_errors,
2444                    stats->tx_compressed);
2445 }
2446 
2447 /*
2448  *      Called from the PROCfs module. This now uses the new arbitrary sized
2449  *      /proc/net interface to create /proc/net/dev
      *
      *      For SEQ_START_TOKEN the two-line column header is emitted; for
      *      any other @v (a struct net_device pointer) one statistics row is
      *      printed by dev_seq_printf_stats().  Always returns 0.
2450  */
2451 static int dev_seq_show(struct seq_file *seq, void *v)
2452 {
2453         if (v == SEQ_START_TOKEN)
2454                 seq_puts(seq, "Inter-|   Receive                            "
2455                               "                    |  Transmit\n"
2456                               " face |bytes    packets errs drop fifo frame "
2457                               "compressed multicast|bytes    packets errs "
2458                               "drop fifo colls carrier compressed\n");
2459         else
2460                 dev_seq_printf_stats(seq, v);
2461         return 0;
2462 }
2463 
2464 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2465 {
2466         struct netif_rx_stats *rc = NULL;
2467 
2468         while (*pos < NR_CPUS)
2469                 if (cpu_online(*pos)) {
2470                         rc = &per_cpu(netdev_rx_stat, *pos);
2471                         break;
2472                 } else
2473                         ++*pos;
2474         return rc;
2475 }
2476 
/* seq_file start: stats of the first online CPU at or after *pos, or NULL. */
2477 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2478 {
2479         return softnet_get_online(pos);
2480 }
2481 
/* seq_file next: step past the current CPU, then find the next online one. */
2482 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2483 {
2484         ++*pos;
2485         return softnet_get_online(pos);
2486 }
2487 
/* seq_file stop: nothing to release, softnet_seq_start() takes no lock. */
2488 static void softnet_seq_stop(struct seq_file *seq, void *v)
2489 {
2490 }
2491 
/*
 * softnet_seq_show - print one CPU's receive statistics as nine hex fields:
 * total, dropped, time_squeeze, five constant zeros kept as placeholders,
 * and cpu_collision.  Always returns 0.
 */
2492 static int softnet_seq_show(struct seq_file *seq, void *v)
2493 {
2494         struct netif_rx_stats *s = v;
2495 
2496         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2497                    s->total, s->dropped, s->time_squeeze, 0,
2498                    0, 0, 0, 0, /* was fastroute */
2499                    s->cpu_collision );
2500         return 0;
2501 }
2502 
/* seq_file iterator for the per-device statistics listing. */
2503 static const struct seq_operations dev_seq_ops = {
2504         .start = dev_seq_start,        /* takes dev_base_lock */
2505         .next  = dev_seq_next,
2506         .stop  = dev_seq_stop,         /* releases dev_base_lock */
2507         .show  = dev_seq_show,
2508 };
2509 
/*
 * Open handler for the device listing: sets up a seq_file using
 * dev_seq_ops through seq_open_net(), passing the size of struct
 * seq_net_private (the callbacks obtain the namespace via seq_file_net()).
 */
2510 static int dev_seq_open(struct inode *inode, struct file *file)
2511 {
2512         return seq_open_net(inode, file, &dev_seq_ops,
2513                             sizeof(struct seq_net_private));
2514 }
2515 
/* File operations for the device listing; paired with dev_seq_open(). */
2516 static const struct file_operations dev_seq_fops = {
2517         .owner   = THIS_MODULE,
2518         .open    = dev_seq_open,
2519         .read    = seq_read,
2520         .llseek  = seq_lseek,
2521         .release = seq_release_net,    /* counterpart of seq_open_net() */
2522 };
2523 
/* seq_file iterator over the per-CPU receive statistics of online CPUs. */
2524 static const struct seq_operations softnet_seq_ops = {
2525         .start = softnet_seq_start,
2526         .next  = softnet_seq_next,
2527         .stop  = softnet_seq_stop,
2528         .show  = softnet_seq_show,
2529 };
2530 
/* Open handler: attach the softnet_seq_ops iterator to @file. */
2531 static int softnet_seq_open(struct inode *inode, struct file *file)
2532 {
2533         return seq_open(file, &softnet_seq_ops);
2534 }
2535 
/* File operations for the per-CPU statistics file; see softnet_seq_open(). */
2536 static const struct file_operations softnet_seq_fops = {
2537         .owner   = THIS_MODULE,
2538         .open    = softnet_seq_open,
2539         .read    = seq_read,
2540         .llseek  = seq_lseek,
2541         .release = seq_release,
2542 };
2543 
2544 static void *ptype_get_idx(loff_t pos)
2545 {
2546         struct packet_type *pt = NULL;
2547         loff_t i = 0;
2548         int t;
2549 
2550         list_for_each_entry_rcu(pt, &ptype_all, list) {
2551                 if (i == pos)
2552                         return pt;
2553                 ++i;
2554         }
2555 
2556         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
2557                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2558                         if (i == pos)
2559                                 return pt;
2560                         ++i;
2561                 }
2562         }
2563         return NULL;
2564 }
2565 
/*
 * seq_file start for the packet-type listing.  Enters an RCU read-side
 * section that ptype_seq_stop() leaves.  Position 0 is the header token;
 * position N (N >= 1) maps to handler index N - 1.
 */
2566 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2567         __acquires(RCU)
2568 {
2569         rcu_read_lock();
2570         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2571 }
2572 
/*
 * seq_file next for the packet-type listing.
 * Advances *pos and returns the handler following @v: first the rest of
 * ptype_all, then the ptype_base buckets in index order, skipping empty
 * buckets.  Returns NULL after the last bucket.  An entry is taken to be
 * on ptype_all when its type is htons(ETH_P_ALL); otherwise its bucket is
 * recomputed from its type.
 */
2573 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2574 {
2575         struct packet_type *pt;
2576         struct list_head *nxt;
2577         int hash;
2578 
2579         ++*pos;
2580         if (v == SEQ_START_TOKEN)
2581                 return ptype_get_idx(0);
2582 
2583         pt = v;
2584         nxt = pt->list.next;
2585         if (pt->type == htons(ETH_P_ALL)) {
2586                 if (nxt != &ptype_all)
2587                         goto found;
                     /* end of ptype_all: continue with the first hash bucket */
2588                 hash = 0;
2589                 nxt = ptype_base[0].next;
2590         } else
2591                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
2592 
             /* nxt pointing at a bucket head means that bucket is exhausted */
2593         while (nxt == &ptype_base[hash]) {
2594                 if (++hash >= PTYPE_HASH_SIZE)
2595                         return NULL;
2596                 nxt = ptype_base[hash].next;
2597         }
2598 found:
2599         return list_entry(nxt, struct packet_type, list);
2600 }
2601 
/* seq_file stop: leave the RCU read-side section entered by ptype_seq_start(). */
2602 static void ptype_seq_stop(struct seq_file *seq, void *v)
2603         __releases(RCU)
2604 {
2605         rcu_read_unlock();
2606 }
2607 
/*
 * ptype_seq_decode - print the address @sym in symbolic form.
 * With CONFIG_KALLSYMS and a successful lookup the output is
 * ":module:symbol+0xoffset", or just "symbol+0xoffset" when the lookup
 * reports no module.  Otherwise the raw pointer is printed as "[%p]".
 */
2608 static void ptype_seq_decode(struct seq_file *seq, void *sym)
2609 {
2610 #ifdef CONFIG_KALLSYMS
2611         unsigned long offset = 0, symsize;
2612         const char *symname;
2613         char *modname;
2614         char namebuf[128];
2615 
2616         symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
2617                                   &modname, namebuf);
2618 
2619         if (symname) {
2620                 char *delim = ":";
2621 
                     /* no module: blank both the name and the ':' separators */
2622                 if (!modname)
2623                         modname = delim = "";
2624                 seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
2625                            symname, offset);
2626                 return;
2627         }
2628 #endif
2629 
2630         seq_printf(seq, "[%p]", sym);
2631 }
2632 
2633 static int ptype_seq_show(struct seq_file *seq, void *v)
2634 {
2635         struct packet_type *pt = v;
2636 
2637         if (v == SEQ_START_TOKEN)
2638                 seq_puts(seq, "Type Device      Function\n");
2639         else {
2640                 if (pt->type == htons(ETH_P_ALL))
2641                         seq_puts(seq, "ALL ");
2642                 else
2643                         seq_printf(seq, "%04x", ntohs(pt->type));
2644 
2645                 seq_printf(seq, " %-8s ",
2646                            pt->dev ? pt->dev->name : "");
2647                 ptype_seq_decode(seq,  pt->func);
2648                 seq_putc(seq, '\n');
2649         }
2650 
2651         return 0;
2652 }
2653 
2654 static const struct seq_operations ptype_seq_ops = {
2655         .start = ptype_seq_start,
2656         .next  = ptype_seq_next,
2657         .stop  = ptype_seq_stop,
2658         .show  = ptype_seq_show,
2659 };
2660 
2661 static int ptype_seq_open(struct inode *inode, struct file *file)
2662 {
2663         return seq_open(file, &ptype_seq_ops);
2664 }
2665 
2666 static const struct file_operations ptype_seq_fops = {
2667         .owner   = THIS_MODULE,
2668         .open    = ptype_seq_open,
2669         .read    = seq_read,
2670         .llseek  = seq_lseek,
2671         .release = seq_release,
2672 };
2673 
2674 
2675 static int __net_init dev_proc_net_init(struct net *net)
2676 {
2677         int rc = -ENOMEM;
2678 
2679         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2680                 goto out;
2681         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2682                 goto out_dev;
2683         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2684                 goto out_softnet;
2685 
2686         if (wext_proc_init(net))
2687                 goto out_ptype;
2688         rc = 0;
2689 out:
2690         return rc;
2691 out_ptype:
2692         proc_net_remove(net, "ptype");
2693 out_softnet:
2694         proc_net_remove(net, "softnet_stat");
2695 out_dev:
2696         proc_net_remove(net, "dev");
2697         goto out;
2698 }
2699 
2700 static void __net_exit dev_proc_net_exit(struct net *net)
2701 {
2702         wext_proc_exit(net);
2703 
2704         proc_net_remove(net, "ptype");
2705         proc_net_remove(net, "softnet_stat");
2706         proc_net_remove(net, "dev");
2707 }
2708 
2709 static struct pernet_operations __net_initdata dev_proc_ops = {
2710         .init = dev_proc_net_init,
2711         .exit = dev_proc_net_exit,
2712 };
2713 
2714 static int __init dev_proc_init(void)
2715 {
2716         return register_pernet_subsys(&dev_proc_ops);
2717 }
2718 #else
2719 #define dev_proc_init() 0
2720 #endif  /* CONFIG_PROC_FS */
2721 
2722 
2723 /**
2724  *      netdev_set_master       -       set up master/slave pair
2725  *      @slave: slave device
2726  *      @master: new master device
2727  *
2728  *      Changes the master device of the slave. Pass %NULL to break the
2729  *      bonding. The caller must hold the RTNL semaphore. On a failure
2730  *      a negative errno code is returned. On success the reference counts
2731  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2732  *      function returns zero.
2733  */
2734 int netdev_set_master(struct net_device *slave, struct net_device *master)
2735 {
2736         struct net_device *old = slave->master;
2737 
2738         ASSERT_RTNL();
2739 
2740         if (master) {
2741                 if (old)
2742                         return -EBUSY;
2743                 dev_hold(master);
2744         }
2745 
2746         slave->master = master;
2747 
2748         synchronize_net();
2749 
2750         if (old)
2751                 dev_put(old);
2752 
2753         if (master)
2754                 slave->flags |= IFF_SLAVE;
2755         else
2756                 slave->flags &= ~IFF_SLAVE;
2757 
2758         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2759         return 0;
2760 }
2761 
2762 static void __dev_set_promiscuity(struct net_device *dev, int inc)
2763 {
2764         unsigned short old_flags = dev->flags;
2765 
2766         ASSERT_RTNL();
2767 
2768         if ((dev->promiscuity += inc) == 0)
2769                 dev->flags &= ~IFF_PROMISC;
2770         else
2771                 dev->flags |= IFF_PROMISC;
2772         if (dev->flags != old_flags) {
2773                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2774                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2775                                                                "left");
2776                 if (audit_enabled)
2777                         audit_log(current->audit_context, GFP_ATOMIC,
2778                                 AUDIT_ANOM_PROMISCUOUS,
2779                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2780                                 dev->name, (dev->flags & IFF_PROMISC),
2781                                 (old_flags & IFF_PROMISC),
2782                                 audit_get_loginuid(current),
2783                                 current->uid, current->gid,
2784                                 audit_get_sessionid(current));
2785 
2786                 if (dev->change_rx_flags)
2787                         dev->change_rx_flags(dev, IFF_PROMISC);
2788         }
2789 }
2790 
2791 /**
2792  *      dev_set_promiscuity     - update promiscuity count on a device
2793  *      @dev: device
2794  *      @inc: modifier
2795  *
2796  *      Add or remove promiscuity from a device. While the count in the device
2797  *      remains above zero the interface remains promiscuous. Once it hits zero
2798  *      the device reverts back to normal filtering operation. A negative inc
2799  *      value is used to drop promiscuity on the device.
2800  */
2801 void dev_set_promiscuity(struct net_device *dev, int inc)
2802 {
2803         unsigned short old_flags = dev->flags;
2804 
2805         __dev_set_promiscuity(dev, inc);
2806         if (dev->flags != old_flags)
2807                 dev_set_rx_mode(dev);
2808 }
2809 
2810 /**
2811  *      dev_set_allmulti        - update allmulti count on a device
2812  *      @dev: device
2813  *      @inc: modifier
2814  *
2815  *      Add or remove reception of all multicast frames to a device. While the
2816  *      count in the device remains above zero the interface remains listening
2817  *      to all interfaces. Once it hits zero the device reverts back to normal
2818  *      filtering operation. A negative @inc value is used to drop the counter
2819  *      when releasing a resource needing all multicasts.
2820  */
2821 
2822 void dev_set_allmulti(struct net_device *dev, int inc)
2823 {
2824         unsigned short old_flags = dev->flags;
2825 
2826         ASSERT_RTNL();
2827 
2828         dev->flags |= IFF_ALLMULTI;
2829         if ((dev->allmulti += inc) == 0)
2830                 dev->flags &= ~IFF_ALLMULTI;
2831         if (dev->flags ^ old_flags) {
2832                 if (dev->change_rx_flags)
2833                         dev->change_rx_flags(dev, IFF_ALLMULTI);
2834                 dev_set_rx_mode(dev);
2835         }
2836 }
2837 
2838 /*
2839  *      Upload unicast and multicast address lists to device and
2840  *      configure RX filtering. When the device doesn't support unicast
2841  *      filtering it is put in promiscuous mode while unicast addresses
2842  *      are present.
2843  */
2844 void __dev_set_rx_mode(struct net_device *dev)
2845 {
2846         /* dev_open will call this function so the list will stay sane. */
2847         if (!(dev->flags&IFF_UP))
2848                 return;
2849 
2850         if (!netif_device_present(dev))
2851                 return;
2852 
2853         if (dev->set_rx_mode)
2854                 dev->set_rx_mode(dev);
2855         else {
2856                 /* Unicast addresses changes may only happen under the rtnl,
2857                  * therefore calling __dev_set_promiscuity here is safe.
2858                  */
2859                 if (dev->uc_count > 0 && !dev->uc_promisc) {
2860                         __dev_set_promiscuity(dev, 1);
2861                         dev->uc_promisc = 1;
2862                 } else if (dev->uc_count == 0 && dev->uc_promisc) {
2863                         __dev_set_promiscuity(dev, -1);
2864                         dev->uc_promisc = 0;
2865                 }
2866 
2867                 if (dev->set_multicast_list)
2868                         dev->set_multicast_list(dev);
2869         }
2870 }
2871 
/* Run __dev_set_rx_mode() on @dev with its TX lock held (BH disabled). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_tx_unlock_bh(dev);
}
2878 
2879 int __dev_addr_delete(struct dev_addr_list **list, int *count,
2880                       void *addr, int alen, int glbl)
2881 {
2882         struct dev_addr_list *da;
2883 
2884         for (; (da = *list) != NULL; list = &da->next) {
2885                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2886                     alen == da->da_addrlen) {
2887                         if (glbl) {
2888                                 int old_glbl = da->da_gusers;
2889                                 da->da_gusers = 0;
2890                                 if (old_glbl == 0)
2891                                         break;
2892                         }
2893                         if (--da->da_users)
2894                                 return 0;
2895 
2896                         *list = da->next;
2897                         kfree(da);
2898                         (*count)--;
2899                         return 0;
2900                 }
2901         }
2902         return -ENOENT;
2903 }
2904 
2905 int __dev_addr_add(struct dev_addr_list **list, int *count,
2906                    void *addr, int alen, int glbl)
2907 {
2908         struct dev_addr_list *da;
2909 
2910         for (da = *list; da != NULL; da = da->next) {
2911                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2912                     da->da_addrlen == alen) {
2913                         if (glbl) {
2914                                 int old_glbl = da->da_gusers;
2915                                 da->da_gusers = 1;
2916                                 if (old_glbl)
2917                                         return 0;
2918                         }
2919                         da->da_users++;
2920                         return 0;
2921                 }
2922         }
2923 
2924         da = kzalloc(sizeof(*da), GFP_ATOMIC);
2925         if (da == NULL)
2926                 return -ENOMEM;
2927         memcpy(da->da_addr, addr, alen);
2928         da->da_addrlen = alen;
2929         da->da_users = 1;
2930         da->da_gusers = glbl ? 1 : 0;
2931         da->next = *list;
2932         *list = da;
2933         (*count)++;
2934         return 0;
2935 }
2936 
2937 /**
2938  *      dev_unicast_delete      - Release secondary unicast address.
2939  *      @dev: device
2940  *      @addr: address to delete
2941  *      @alen: length of @addr
2942  *
2943  *      Release reference to a secondary unicast address and remove it
2944  *      from the device if the reference count drops to zero.
2945  *
2946  *      The caller must hold the rtnl_mutex.
2947  */
2948 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2949 {
2950         int err;
2951 
2952         ASSERT_RTNL();
2953 
2954         netif_tx_lock_bh(dev);
2955         err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2956         if (!err)
2957                 __dev_set_rx_mode(dev);
2958         netif_tx_unlock_bh(dev);
2959         return err;
2960 }
2961 EXPORT_SYMBOL(dev_unicast_delete);
2962 
2963 /**
2964  *      dev_unicast_add         - add a secondary unicast address
2965  *      @dev: device
2966  *      @addr: address to delete
2967  *      @alen: length of @addr
2968  *
2969  *      Add a secondary unicast address to the device or increase
2970  *      the reference count if it already exists.
2971  *
2972  *      The caller must hold the rtnl_mutex.
2973  */
2974 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2975 {
2976         int err;
2977 
2978         ASSERT_RTNL();
2979 
2980         netif_tx_lock_bh(dev);
2981         err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2982         if (!err)
2983                 __dev_set_rx_mode(dev);
2984         netif_tx_unlock_bh(dev);
2985         return err;
2986 }
2987 EXPORT_SYMBOL(dev_unicast_add);
2988 
2989 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
2990                     struct dev_addr_list **from, int *from_count)
2991 {
2992         struct dev_addr_list *da, *next;
2993         int err = 0;
2994 
2995         da = *from;
2996         while (da != NULL) {
2997                 next = da->next;
2998                 if (!da->da_synced) {
2999                         err = __dev_addr_add(to, to_count,
3000                                              da->da_addr, da->da_addrlen, 0);
3001                         if (err < 0)
3002                                 break;
3003                         da->da_synced = 1;
3004                         da->da_users++;
3005                 } else if (da->da_users == 1) {
3006                         __dev_addr_delete(to, to_count,
3007                                           da->da_addr, da->da_addrlen, 0);
3008                         __dev_addr_delete(from, from_count,
3009                                           da->da_addr, da->da_addrlen, 0);
3010                 }
3011                 da = next;
3012         }
3013         return err;
3014 }
3015 
3016 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3017                        struct dev_addr_list **from, int *from_count)
3018 {
3019         struct dev_addr_list *da, *next;
3020 
3021         da = *from;
3022         while (da != NULL) {
3023                 next = da->next;
3024                 if (da->da_synced) {
3025                         __dev_addr_delete(to, to_count,
3026                                           da->da_addr, da->da_addrlen, 0);
3027                         da->da_synced = 0;
3028                         __dev_addr_delete(from, from_count,
3029                                           da->da_addr, da->da_addrlen, 0);
3030                 }
3031                 da = next;
3032         }
3033 }
3034 
3035 /**
3036  *      dev_unicast_sync - Synchronize device's unicast list to another device
3037  *      @to: destination device
3038  *      @from: source device
3039  *
3040  *      Add newly added addresses to the destination device and release
3041  *      addresses that have no users left. The source device must be
3042  *      locked by netif_tx_lock_bh.
3043  *
3044  *      This function is intended to be called from the dev->set_rx_mode
3045  *      function of layered software devices.
3046  */
3047 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3048 {
3049         int err = 0;
3050 
3051         netif_tx_lock_bh(to);
3052         err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3053                               &from->uc_list, &from->uc_count);
3054         if (!err)
3055                 __dev_set_rx_mode(to);
3056         netif_tx_unlock_bh(to);
3057         return err;
3058 }
3059 EXPORT_SYMBOL(dev_unicast_sync);
3060 
3061 /**
3062  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
3063  *      @to: destination device
3064  *      @from: source device
3065  *
3066  *      Remove all addresses that were added to the destination device by
3067  *      dev_unicast_sync(). This function is intended to be called from the
3068  *      dev->stop function of layered software devices.
3069  */
3070 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3071 {
3072         netif_tx_lock_bh(from);
3073         netif_tx_lock_bh(to);
3074 
3075         __dev_addr_unsync(&to->uc_list, &to->uc_count,
3076                           &from->uc_list, &from->uc_count);
3077         __dev_set_rx_mode(to);
3078 
3079         netif_tx_unlock_bh(to);
3080         netif_tx_unlock_bh(from);
3081 }
3082 EXPORT_SYMBOL(dev_unicast_unsync);
3083 
3084 static void __dev_addr_discard(struct dev_addr_list **list)
3085 {
3086         struct dev_addr_list *tmp;
3087 
3088         while (*list != NULL) {
3089                 tmp = *list;
3090                 *list = tmp->next;
3091                 if (tmp->da_users > tmp->da_gusers)
3092                         printk("__dev_addr_discard: address leakage! "
3093                                "da_users=%d\n", tmp->da_users);
3094                 kfree(tmp);
3095         }
3096 }
3097 
3098 static void dev_addr_discard(struct net_device *dev)
3099 {
3100         netif_tx_lock_bh(dev);
3101 
3102         __dev_addr_discard(&dev->uc_list);
3103         dev->uc_count = 0;
3104 
3105         __dev_addr_discard(&dev->mc_list);
3106         dev->mc_count = 0;
3107 
3108         netif_tx_unlock_bh(dev);
3109 }
3110 
3111 unsigned dev_get_flags(const struct net_device *dev)
3112 {
3113         unsigned flags;
3114 
3115         flags = (dev->flags & ~(IFF_PROMISC |
3116                                 IFF_ALLMULTI |
3117                                 IFF_RUNNING |
3118                                 IFF_LOWER_UP |
3119                                 IFF_DORMANT)) |
3120                 (dev->gflags & (IFF_PROMISC |
3121                                 IFF_ALLMULTI));
3122 
3123         if (netif_running(dev)) {
3124                 if (netif_oper_up(dev))
3125                         flags |= IFF_RUNNING;
3126                 if (netif_carrier_ok(dev))
3127                         flags |= IFF_LOWER_UP;
3128                 if (netif_dormant(dev))
3129                         flags |= IFF_DORMANT;
3130         }
3131 
3132         return flags;
3133 }
3134 
3135 int dev_change_flags(struct net_device *dev, unsigned flags)
3136 {
3137         int ret, changes;
3138         int old_flags = dev->flags;
3139 
3140         ASSERT_RTNL();
3141 
3142         /*
3143          *      Set the flags on our device.
3144          */
3145 
3146         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3147                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3148                                IFF_AUTOMEDIA)) |
3149                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3150                                     IFF_ALLMULTI));
3151 
3152         /*
3153          *      Load in the correct multicast list now the flags have changed.
3154          */
3155 
3156         if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
3157                 dev->change_rx_flags(dev, IFF_MULTICAST);
3158 
3159         dev_set_rx_mode(dev);
3160 
3161         /*
3162          *      Have we downed the interface. We handle IFF_UP ourselves
3163          *      according to user attempts to set it, rather than blindly
3164          *      setting it.
3165          */
3166 
3167         ret = 0;
3168         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
3169                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3170 
3171                 if (!ret)
3172                         dev_set_rx_mode(dev);
3173         }
3174 
3175         if (dev->flags & IFF_UP &&
3176             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3177                                           IFF_VOLATILE)))
3178                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3179 
3180         if ((flags ^ dev->gflags) & IFF_PROMISC) {
3181                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3182                 dev->gflags ^= IFF_PROMISC;
3183                 dev_set_promiscuity(dev, inc);
3184         }
3185 
3186         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3187            is important. Some (broken) drivers set IFF_PROMISC, when
3188            IFF_ALLMULTI is requested not asking us and not reporting.
3189          */
3190         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3191                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3192                 dev->gflags ^= IFF_ALLMULTI;
3193                 dev_set_allmulti(dev, inc);
3194         }
3195 
3196         /* Exclude state transition flags, already notified */
3197         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3198         if (changes)
3199                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3200 
3201         return ret;
3202 }
3203 
3204 int dev_set_mtu(struct net_device *dev, int new_mtu)
3205 {
3206         int err;
3207 
3208         if (new_mtu == dev->mtu)
3209                 return 0;
3210 
3211         /*      MTU must be positive.    */
3212         if (new_mtu < 0)
3213                 return -EINVAL;
3214 
3215         if (!netif_device_present(dev))
3216                 return -ENODEV;
3217 
3218         err = 0;
3219         if (dev->change_mtu)
3220                 err = dev->change_mtu(dev, new_mtu);
3221         else
3222                 dev->mtu = new_mtu;
3223         if (!err && dev->flags & IFF_UP)
3224                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3225         return err;
3226 }
3227 
3228 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3229 {
3230         int err;
3231 
3232         if (!dev->set_mac_address)
3233                 return -EOPNOTSUPP;
3234         if (sa->sa_family != dev->type)
3235                 return -EINVAL;
3236         if (!netif_device_present(dev))
3237                 return -ENODEV;
3238         err = dev->set_mac_address(dev, sa);
3239         if (!err)
3240                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3241         return err;
3242 }
3243 
3244 /*
3245  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3246  */
3247 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3248 {
3249         int err;
3250         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3251 
3252         if (!dev)
3253                 return -ENODEV;
3254 
3255         switch (cmd) {
3256                 case SIOCGIFFLAGS:      /* Get interface flags */
3257                         ifr->ifr_flags = dev_get_flags(dev);
3258                         return 0;
3259 
3260                 case SIOCGIFMETRIC:     /* Get the metric on the interface
3261                                            (currently unused) */
3262                         ifr->ifr_metric = 0;
3263                         return 0;
3264 
3265                 case SIOCGIFMTU:        /* Get the MTU of a device */
3266                         ifr->ifr_mtu = dev->mtu;
3267                         return 0;
3268 
3269                 case SIOCGIFHWADDR:
3270                         if (!dev->addr_len)
3271                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3272                         else
3273                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3274                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3275                         ifr->ifr_hwaddr.sa_family = dev->type;
3276                         return 0;
3277 
3278                 case SIOCGIFSLAVE:
3279                         err = -EINVAL;
3280                         break;
3281 
3282                 case SIOCGIFMAP:
3283                         ifr->ifr_map.mem_start = dev->mem_start;
3284                         ifr->ifr_map.mem_end   = dev->mem_end;
3285                         ifr->ifr_map.base_addr = dev->base_addr;
3286                         ifr->ifr_map.irq       = dev->irq;
3287                         ifr->ifr_map.dma       = dev->dma;
3288                         ifr->ifr_map.port      = dev->if_port;
3289                         return 0;
3290 
3291                 case SIOCGIFINDEX:
3292                         ifr->ifr_ifindex = dev->ifindex;
3293                         return 0;
3294 
3295                 case SIOCGIFTXQLEN:
3296                         ifr->ifr_qlen = dev->tx_queue_len;
3297                         return 0;
3298 
3299                 default:
3300                         /* dev_ioctl() should ensure this case
3301                          * is never reached
3302                          */
3303                         WARN_ON(1);
3304                         err = -EINVAL;
3305                         break;
3306 
3307         }
3308         return err;
3309 }
3310 
3311 /*
3312  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
3313  */
3314 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3315 {
3316         int err;
3317         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3318 
3319         if (!dev)
3320                 return -ENODEV;
3321 
3322         switch (cmd) {
3323                 case SIOCSIFFLAGS:      /* Set interface flags */
3324                         return dev_change_flags(dev, ifr->ifr_flags);
3325 
3326                 case SIOCSIFMETRIC:     /* Set the metric on the interface
3327                                            (currently unused) */
3328                         return -EOPNOTSUPP;
3329 
3330                 case SIOCSIFMTU:        /* Set the MTU of a device */
3331                         return dev_set_mtu(dev, ifr->ifr_mtu);
3332 
3333                 case SIOCSIFHWADDR:
3334                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3335 
3336                 case SIOCSIFHWBROADCAST:
3337                         if (ifr->ifr_hwaddr.sa_family != dev->type)
3338                                 return -EINVAL;
3339                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3340                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3341                         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3342                         return 0;
3343 
3344                 case SIOCSIFMAP:
3345                         if (dev->set_config) {
3346                                 if (!netif_device_present(dev))
3347                                         return -ENODEV;
3348                                 return dev->set_config(dev, &ifr->ifr_map);
3349                         }
3350                         return -EOPNOTSUPP;
3351 
3352                 case SIOCADDMULTI:
3353                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3354                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3355                                 return -EINVAL;
3356                         if (!netif_device_present(dev))
3357                                 return -ENODEV;
3358                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3359                                           dev->addr_len, 1);
3360 
3361                 case SIOCDELMULTI:
3362                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3363                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3364                                 return -EINVAL;
3365                         if (!netif_device_present(dev))
3366                                 return -ENODEV;
3367                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3368                                              dev->addr_len, 1);
3369 
3370                 case SIOCSIFTXQLEN:
3371                         if (ifr->ifr_qlen < 0)
3372                                 return -EINVAL;
3373                         dev->tx_queue_len = ifr->ifr_qlen;
3374                         return 0;
3375 
3376                 case SIOCSIFNAME:
3377                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3378                         return dev_change_name(dev, ifr->ifr_newname);
3379 
3380                 /*
3381                  *      Unknown or private ioctl
3382                  */
3383 
3384                 default:
3385                         if ((cmd >= SIOCDEVPRIVATE &&
3386                             cmd <= SIOCDEVPRIVATE + 15) ||
3387                             cmd == SIOCBONDENSLAVE ||
3388                             cmd == SIOCBONDRELEASE ||
3389                             cmd == SIOCBONDSETHWADDR ||
3390                             cmd == SIOCBONDSLAVEINFOQUERY ||
3391                             cmd == SIOCBONDINFOQUERY ||
3392                             cmd == SIOCBONDCHANGEACTIVE ||
3393                             cmd == SIOCGMIIPHY ||
3394                             cmd == SIOCGMIIREG ||
3395                             cmd == SIOCSMIIREG ||
3396                             cmd == SIOCBRADDIF ||
3397                             cmd == SIOCBRDELIF ||
3398                             cmd == SIOCWANDEV) {
3399                                 err = -EOPNOTSUPP;
3400                                 if (dev->do_ioctl) {
3401                                         if (netif_device_present(dev))
3402                                                 err = dev->do_ioctl(dev, ifr,
3403                                                                     cmd);
3404                                         else
3405                                                 err = -ENODEV;
3406                                 }
3407                         } else
3408                                 err = -EINVAL;
3409 
3410         }
3411         return err;
3412 }
3413 
3414 /*
3415  *      This function handles all "interface"-type I/O control requests. The actual
3416  *      'doing' part of this is dev_ifsioc above.
3417  */
3418 
3419 /**
3420  *      dev_ioctl       -       network device ioctl
3421  *      @net: the applicable net namespace
3422  *      @cmd: command to issue
3423  *      @arg: pointer to a struct ifreq in user space
3424  *
3425  *      Issue ioctl functions to devices. This is normally called by the
3426  *      user space syscall interfaces but can sometimes be useful for
3427  *      other purposes. The return value is the return from the syscall if
3428  *      positive or a negative errno code on error.
3429  */
3430 
3431 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3432 {
3433         struct ifreq ifr;
3434         int ret;
3435         char *colon;
3436 
3437         /* One special case: SIOCGIFCONF takes ifconf argument
3438            and requires shared lock, because it sleeps writing
3439            to user space.
3440          */
3441 
3442         if (cmd == SIOCGIFCONF) {
3443                 rtnl_lock();
3444                 ret = dev_ifconf(net, (char __user *) arg);
3445                 rtnl_unlock();
3446                 return ret;
3447         }
3448         if (cmd == SIOCGIFNAME)
3449                 return dev_ifname(net, (struct ifreq __user *)arg);
3450 
3451         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3452                 return -EFAULT;
3453 
3454         ifr.ifr_name[IFNAMSIZ-1] = 0;
3455 
3456         colon = strchr(ifr.ifr_name, ':');
3457         if (colon)
3458                 *colon = 0;
3459 
3460         /*
3461          *      See which interface the caller is talking about.
3462          */
3463 
3464         switch (cmd) {
3465                 /*
3466                  *      These ioctl calls:
3467                  *      - can be done by all.
3468                  *      - atomic and do not require locking.
3469                  *      - return a value
3470                  */
3471                 case SIOCGIFFLAGS:
3472                 case SIOCGIFMETRIC:
3473                 case SIOCGIFMTU:
3474                 case SIOCGIFHWADDR:
3475                 case SIOCGIFSLAVE:
3476                 case SIOCGIFMAP:
3477                 case SIOCGIFINDEX:
3478                 case SIOCGIFTXQLEN:
3479                         dev_load(net, ifr.ifr_name);
3480                         read_lock(&dev_base_lock);
3481                         ret = dev_ifsioc_locked(net, &ifr, cmd);
3482                         read_unlock(&dev_base_lock);
3483                         if (!ret) {
3484                                 if (colon)
3485                                         *colon = ':';
3486                                 if (copy_to_user(arg, &ifr,
3487                                                  sizeof(struct ifreq)))
3488                                         ret = -EFAULT;
3489                         }
3490                         return ret;
3491 
3492                 case SIOCETHTOOL:
3493                         dev_load(net, ifr.ifr_name);
3494                         rtnl_lock();
3495                         ret = dev_ethtool(net, &ifr);
3496                         rtnl_unlock();
3497                         if (!ret) {
3498                                 if (colon)
3499                                         *colon = ':';
3500                                 if (copy_to_user(arg, &ifr,
3501                                                  sizeof(struct ifreq)))
3502                                         ret = -EFAULT;
3503                         }
3504                         return ret;
3505 
3506                 /*
3507                  *      These ioctl calls:
3508                  *      - require superuser power.
3509                  *      - require strict serialization.
3510                  *      - return a value
3511                  */
3512                 case SIOCGMIIPHY:
3513                 case SIOCGMIIREG:
3514                 case SIOCSIFNAME:
3515                         if (!capable(CAP_NET_ADMIN))
3516                                 return -EPERM;
3517                         dev_load(net, ifr.ifr_name);
3518                         rtnl_lock();
3519                         ret = dev_ifsioc(net, &ifr, cmd);
3520                         rtnl_unlock();
3521                         if (!ret) {
3522                                 if (colon)
3523                                         *colon = ':';
3524                                 if (copy_to_user(arg, &ifr,
3525                                                  sizeof(struct ifreq)))
3526                                         ret = -EFAULT;
3527                         }
3528                         return ret;
3529 
3530                 /*
3531                  *      These ioctl calls:
3532                  *      - require superuser power.
3533                  *      - require strict serialization.
3534                  *      - do not return a value
3535                  */
3536                 case SIOCSIFFLAGS:
3537                 case SIOCSIFMETRIC:
3538                 case SIOCSIFMTU:
3539                 case SIOCSIFMAP:
3540                 case SIOCSIFHWADDR:
3541                 case SIOCSIFSLAVE:
3542                 case SIOCADDMULTI:
3543                 case SIOCDELMULTI:
3544                 case SIOCSIFHWBROADCAST:
3545                 case SIOCSIFTXQLEN:
3546                 case SIOCSMIIREG:
3547                 case SIOCBONDENSLAVE:
3548                 case SIOCBONDRELEASE:
3549                 case SIOCBONDSETHWADDR:
3550                 case SIOCBONDCHANGEACTIVE:
3551                 case SIOCBRADDIF:
3552                 case SIOCBRDELIF:
3553                         if (!capable(CAP_NET_ADMIN))
3554                                 return -EPERM;
3555                         /* fall through */
3556                 case SIOCBONDSLAVEINFOQUERY:
3557                 case SIOCBONDINFOQUERY:
3558                         dev_load(net, ifr.ifr_name);
3559                         rtnl_lock();
3560                         ret = dev_ifsioc(net, &ifr, cmd);
3561                         rtnl_unlock();
3562                         return ret;
3563 
3564                 case SIOCGIFMEM:
3565                         /* Get the per device memory space. We can add this but
3566                          * currently do not support it */
3567                 case SIOCSIFMEM:
3568                         /* Set the per device memory buffer space.
3569                          * Not applicable in our case */
3570                 case SIOCSIFLINK:
3571                         return -EINVAL;
3572 
3573                 /*
3574                  *      Unknown or private ioctl.
3575                  */
3576                 default:
3577                         if (cmd == SIOCWANDEV ||
3578                             (cmd >= SIOCDEVPRIVATE &&
3579                              cmd <= SIOCDEVPRIVATE + 15)) {
3580                                 dev_load(net, ifr.ifr_name);
3581                                 rtnl_lock();
3582                                 ret = dev_ifsioc(net, &ifr, cmd);
3583                                 rtnl_unlock();
3584                                 if (!ret && copy_to_user(arg, &ifr,
3585                                                          sizeof(struct ifreq)))
3586                                         ret = -EFAULT;
3587                                 return ret;
3588                         }
3589                         /* Take care of Wireless Extensions */
3590                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3591                                 return wext_handle_ioctl(net, &ifr, cmd, arg);
3592                         return -EINVAL;
3593         }
3594 }
3595 
3596 
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Advance a function-static counter until it lands on a positive
 *	value for which __dev_get_by_index() finds no device in @net, and
 *	return that value.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		/* Restart at 1 whenever the counter is not positive. */
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
3615 
3616 /* Delayed registration/unregisteration */
3617 static DEFINE_SPINLOCK(net_todo_list_lock);
3618 static LIST_HEAD(net_todo_list);
3619 
3620 static void net_set_todo(struct net_device *dev)
3621 {
3622         spin_lock(&net_todo_list_lock);
3623         list_add_tail(&dev->todo_list, &net_todo_list);
3624         spin_unlock(&net_todo_list_lock);
3625 }
3626 
3627 static void rollback_registered(struct net_device *dev)
3628 {
3629         BUG_ON(dev_boot_phase);
3630         ASSERT_RTNL();
3631 
3632         /* Some devices call without registering for initialization unwind. */
3633         if (dev->reg_state == NETREG_UNINITIALIZED) {
3634                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3635                                   "was registered\n", dev->name, dev);
3636 
3637                 WARN_ON(1);
3638                 return;
3639         }
3640 
3641         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3642 
3643         /* If device is running, close it first. */
3644         dev_close(dev);
3645 
3646         /* And unlink it from device chain. */
3647         unlist_netdevice(dev);
3648 
3649         dev->reg_state = NETREG_UNREGISTERING;
3650 
3651         synchronize_net();
3652 
3653         /* Shutdown queueing discipline. */
3654         dev_shutdown(dev);
3655 
3656 
3657         /* Notify protocols, that we are about to destroy
3658            this device. They should clean all the things.
3659         */
3660         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3661 
3662         /*
3663          *      Flush the unicast and multicast chains
3664          */
3665         dev_addr_discard(dev);
3666 
3667         if (dev->uninit)
3668                 dev->uninit(dev);
3669 
3670         /* Notifier chain MUST detach us from master device. */
3671         BUG_TRAP(!dev->master);
3672 
3673         /* Remove entries from kobject tree */
3674         netdev_unregister_kobject(dev);
3675 
3676         synchronize_net();
3677 
3678         dev_put(dev);
3679 }
3680 
3681 /**
3682  *      register_netdevice      - register a network device
3683  *      @dev: device to register
3684  *
3685  *      Take a completed network device structure and add it to the kernel
3686  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3687  *      chain. 0 is returned on success. A negative errno code is returned
3688  *      on a failure to set up the device, or if the name is a duplicate.
3689  *
3690  *      Callers must hold the rtnl semaphore. You may want
3691  *      register_netdev() instead of this.
3692  *
3693  *      BUGS:
3694  *      The locking appears insufficient to guarantee two parallel registers
3695  *      will not get the same name.
3696  */
3697 
3698 int register_netdevice(struct net_device *dev)
3699 {
3700         struct hlist_head *head;
3701         struct hlist_node *p;
3702         int ret;
3703         struct net *net;
3704 
3705         BUG_ON(dev_boot_phase);
3706         ASSERT_RTNL();
3707 
3708         might_sleep();
3709 
3710         /* When net_device's are persistent, this will be fatal. */
3711         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3712         BUG_ON(!dev->nd_net);
3713         net = dev->nd_net;
3714 
3715         spin_lock_init(&dev->queue_lock);
3716         spin_lock_init(&dev->_xmit_lock);
3717         netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
3718         dev->xmit_lock_owner = (void *)-1;
3719         spin_lock_init(&dev->ingress_lock);
3720 
3721         dev->iflink = -1;
3722 
3723         /* Init, if this function is available */
3724         if (dev->init) {
3725                 ret = dev->init(dev);
3726                 if (ret) {
3727                         if (ret > 0)
3728                                 ret = -EIO;
3729                         goto out;
3730                 }
3731         }
3732 
3733         if (!dev_valid_name(dev->name)) {
3734                 ret = -EINVAL;
3735                 goto err_uninit;
3736         }
3737 
3738         dev->ifindex = dev_new_index(net);
3739         if (dev->iflink == -1)
3740                 dev->iflink = dev->ifindex;
3741 
3742         /* Check for existence of name */
3743         head = dev_name_hash(net, dev->name);
3744         hlist_for_each(p, head) {
3745                 struct net_device *d
3746                         = hlist_entry(p, struct net_device, name_hlist);
3747                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3748                         ret = -EEXIST;
3749                         goto err_uninit;
3750                 }
3751         }
3752 
3753         /* Fix illegal checksum combinations */
3754         if ((dev->features & NETIF_F_HW_CSUM) &&
3755             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3756                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3757                        dev->name);
3758                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3759         }
3760 
3761         if ((dev->features & NETIF_F_NO_CSUM) &&
3762             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3763                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3764                        dev->name);
3765                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3766         }
3767 
3768 
3769         /* Fix illegal SG+CSUM combinations. */
3770         if ((dev->features & NETIF_F_SG) &&
3771             !(dev->features & NETIF_F_ALL_CSUM)) {
3772                 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3773                        dev->name);
3774                 dev->features &= ~NETIF_F_SG;
3775         }
3776 
3777         /* TSO requires that SG is present as well. */
3778         if ((dev->features & NETIF_F_TSO) &&
3779             !(dev->features & NETIF_F_SG)) {
3780                 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3781                        dev->name);
3782                 dev->features &= ~NETIF_F_TSO;
3783         }
3784         if (dev->features & NETIF_F_UFO) {
3785                 if (!(dev->features & NETIF_F_HW_CSUM)) {
3786                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3787                                         "NETIF_F_HW_CSUM feature.\n",
3788                                                         dev->name);
3789                         dev->features &= ~NETIF_F_UFO;
3790                 }
3791                 if (!(dev->features & NETIF_F_SG)) {
3792                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3793                                         "NETIF_F_SG feature.\n",
3794                                         dev->name);
3795                         dev->features &= ~NETIF_F_UFO;
3796                 }
3797         }
3798 
3799         ret = netdev_register_kobject(dev);
3800         if (ret)
3801                 goto err_uninit;
3802         dev->reg_state = NETREG_REGISTERED;
3803 
3804         /*
3805          *      Default initial state at registry is that the
3806          *      device is present.
3807          */
3808 
3809         set_bit(__LINK_STATE_PRESENT, &dev->state);
3810 
3811         dev_init_scheduler(dev);
3812         dev_hold(dev);
3813         list_netdevice(dev);
3814 
3815         /* Notify protocols, that a new device appeared. */
3816         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
3817         ret = notifier_to_errno(ret);
3818         if (ret) {
3819                 rollback_registered(dev);
3820                 dev->reg_state = NETREG_UNREGISTERED;
3821         }
3822 
3823 out:
3824         return ret;
3825 
3826 err_uninit:
3827         if (dev->uninit)
3828                 dev->uninit(dev);
3829         goto out;
3830 }
3831 
3832 /**
3833  *      register_netdev - register a network device
3834  *      @dev: device to register
3835  *
3836  *      Take a completed network device structure and add it to the kernel
3837  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3838  *      chain. 0 is returned on success. A negative errno code is returned
3839  *      on a failure to set up the device, or if the name is a duplicate.
3840  *
3841  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
3842  *      and expands the device name if you passed a format string to
3843  *      alloc_netdev.
3844  */
3845 int register_netdev(struct net_device *dev)
3846 {
3847         int err;
3848 
3849         rtnl_lock();
3850 
3851         /*
3852          * If the name is a format string the caller wants us to do a
3853          * name allocation.
3854          */
3855         if (strchr(dev->name, '%')) {
3856                 err = dev_alloc_name(dev, dev->name);
3857                 if (err < 0)
3858                         goto out;
3859         }
3860 
3861         err = register_netdevice(dev);
3862 out:
3863         rtnl_unlock();
3864         return err;
3865 }
3866 EXPORT_SYMBOL(register_netdev);
3867 
3868 /*
3869  * netdev_wait_allrefs - wait until all references are gone.
3870  *
3871  * This is called when unregistering network devices.
3872  *
3873  * Any protocol or device that holds a reference should register
3874  * for netdevice notification, and cleanup and put back the
3875  * reference if they receive an UNREGISTER event.
3876  * We can get stuck here if buggy protocols don't correctly
3877  * call dev_put.
3878  */
3879 static void netdev_wait_allrefs(struct net_device *dev)
3880 {
3881         unsigned long rebroadcast_time, warning_time;
3882 
3883         rebroadcast_time = warning_time = jiffies;
3884         while (atomic_read(&dev->refcnt) != 0) {
3885                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3886                         rtnl_lock();
3887 
3888                         /* Rebroadcast unregister notification */
3889                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3890 
3891                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3892                                      &dev->state)) {
3893                                 /* We must not have linkwatch events
3894                                  * pending on unregister. If this
3895                                  * happens, we simply run the queue
3896                                  * unscheduled, resulting in a noop
3897                                  * for this device.
3898                                  */
3899                                 linkwatch_run_queue();
3900                         }
3901 
3902                         __rtnl_unlock();
3903 
3904                         rebroadcast_time = jiffies;
3905                 }
3906 
3907                 msleep(250);
3908 
3909                 if (time_after(jiffies, warning_time + 10 * HZ)) {
3910                         printk(KERN_EMERG "unregister_netdevice: "
3911                                "waiting for %s to become free. Usage "
3912                                "count = %d\n",
3913                                dev->name, atomic_read(&dev->refcnt));
3914                         warning_time = jiffies;
3915                 }
3916         }
3917 }
3918 
3919 /* The sequence is:
3920  *
3921  *      rtnl_lock();
3922  *      ...
3923  *      register_netdevice(x1);
3924  *      register_netdevice(x2);
3925  *      ...
3926  *      unregister_netdevice(y1);
3927  *      unregister_netdevice(y2);
3928  *      ...
3929  *      rtnl_unlock();
3930  *      free_netdev(y1);
3931  *      free_netdev(y2);
3932  *
3933  * We are invoked by rtnl_unlock() after it drops the semaphore.
3934  * This allows us to deal with problems:
3935  * 1) We can delete sysfs objects which invoke hotplug
3936  *    without deadlocking with linkwatch via keventd.
3937  * 2) Since we run with the RTNL semaphore not held, we can sleep
3938  *    safely in order to wait for the netdev refcnt to drop to zero.
3939  */
3940 static DEFINE_MUTEX(net_todo_run_mutex);
3941 void netdev_run_todo(void)
3942 {
3943         struct list_head list;
3944 
3945         /* Need to guard against multiple cpu's getting out of order. */
3946         mutex_lock(&net_todo_run_mutex);
3947 
3948         /* Not safe to do outside the semaphore.  We must not return
3949          * until all unregister events invoked by the local processor
3950          * have been completed (either by this todo run, or one on
3951          * another cpu).
3952          */
3953         if (list_empty(&net_todo_list))
3954                 goto out;
3955 
3956         /* Snapshot list, allow later requests */
3957         spin_lock(&net_todo_list_lock);
3958         list_replace_init(&net_todo_list, &list);
3959         spin_unlock(&net_todo_list_lock);
3960 
3961         while (!list_empty(&list)) {
3962                 struct net_device *dev
3963                         = list_entry(list.next, struct net_device, todo_list);
3964                 list_del(&dev->todo_list);
3965 
3966                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3967                         printk(KERN_ERR "network todo '%s' but state %d\n",
3968                                dev->name, dev->reg_state);
3969                         dump_stack();
3970                         continue;
3971                 }
3972 
3973                 dev->reg_state = NETREG_UNREGISTERED;
3974 
3975                 netdev_wait_allrefs(dev);
3976 
3977                 /* paranoia */
3978                 BUG_ON(atomic_read(&dev->refcnt));
3979                 BUG_TRAP(!dev->ip_ptr);
3980                 BUG_TRAP(!dev->ip6_ptr);
3981                 BUG_TRAP(!dev->dn_ptr);
3982 
3983                 if (dev->destructor)
3984                         dev->destructor(dev);
3985 
3986                 /* Free network device */
3987                 kobject_put(&dev->dev.kobj);
3988         }
3989 
3990 out:
3991         mutex_unlock(&net_todo_run_mutex);
3992 }
3993 
3994 static struct net_device_stats *internal_stats(struct net_device *dev)
3995 {
3996         return &dev->stats;
3997 }
3998 
3999 /**
4000  *      alloc_netdev_mq - allocate network device
4001  *      @sizeof_priv:   size of private data to allocate space for
4002  *      @name:          device name format string
4003  *      @setup:         callback to initialize device
4004  *      @queue_count:   the number of subqueues to allocate
4005  *
4006  *      Allocates a struct net_device with private data area for driver use
4007  *      and performs basic initialization.  Also allocates subquue structs
4008  *      for each queue on the device at the end of the netdevice.
4009  */
4010 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4011                 void (*setup)(struct net_device *), unsigned int queue_count)
4012 {
4013         void *p;
4014         struct net_device *dev;
4015         int alloc_size;
4016 
4017         BUG_ON(strlen(name) >= sizeof(dev->name));
4018 
4019         /* ensure 32-byte alignment of both the device and private area */
4020         alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
4021                      (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
4022                      ~NETDEV_ALIGN_CONST;
4023         alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
4024 
4025         p = kzalloc(alloc_size, GFP_KERNEL);
4026         if (!p) {
4027                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4028                 return NULL;
4029         }
4030 
4031         dev = (struct net_device *)
4032                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4033         dev->padded = (char *)dev - (char *)p;
4034         dev->nd_net = &init_net;
4035 
4036         if (sizeof_priv) {
4037                 dev->priv = ((char *)dev +
4038                              ((sizeof(struct net_device) +
4039                                (sizeof(struct net_device_subqueue) *
4040                                 (queue_count - 1)) + NETDEV_ALIGN_CONST)
4041                               & ~NETDEV_ALIGN_CONST));
4042         }
4043 
4044         dev->egress_subqueue_count = queue_count;
4045 
4046         dev->get_stats = internal_stats;
4047         netpoll_netdev_init(dev);
4048         setup(dev);
4049         strcpy(dev->name, name);
4050         return dev;
4051 }
4052 EXPORT_SYMBOL(alloc_netdev_mq);
4053 
4054 /**
4055  *      free_netdev - free network device
4056  *      @dev: device
4057  *
4058  *      This function does the last stage of destroying an allocated device
4059  *      interface. The reference to the device object is released.
4060  *      If this is the last reference then it will be freed.
4061  */
4062 void free_netdev(struct net_device *dev)
4063 {
4064         /*  Compatibility with error handling in drivers */
4065         if (dev->reg_state == NETREG_UNINITIALIZED) {
4066                 kfree((char *)dev - dev->padded);
4067                 return;
4068         }
4069 
4070         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4071         dev->reg_state = NETREG_RELEASED;
4072 
4073         /* will free via device release */
4074         put_device(&dev->dev);
4075 }
4076 
/*
 * Synchronize with packet receive processing.
 *
 * Implemented as synchronize_rcu(); annotated with might_sleep().
 */
void synchronize_net(void)
{
	might_sleep();

	synchronize_rcu();
}
4083 
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.  The device is rolled back and then
 *	queued on the todo list for later processing.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* The remaining unregister work is finished after unlock. */
	net_set_todo(dev);
}
4103 
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore around the call.  In general you want to
 *	use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4123 
4124 /**
4125  *      dev_change_net_namespace - move device to different nethost namespace
4126  *      @dev: device
4127  *      @net: network namespace
4128  *      @pat: If not NULL name pattern to try if the current device name
4129  *            is already taken in the destination network namespace.
4130  *
4131  *      This function shuts down a device interface and moves it
4132  *      to a new network namespace. On success 0 is returned, on
4133  *      a failure a netagive errno code is returned.
4134  *
4135  *      Callers must hold the rtnl semaphore.
4136  */
4137 
4138 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4139 {
4140         char buf[IFNAMSIZ];
4141         const char *destname;
4142         int err;
4143 
4144         ASSERT_RTNL();
4145 
4146         /* Don't allow namespace local devices to be moved. */
4147         err = -EINVAL;
4148         if (dev->features & NETIF_F_NETNS_LOCAL)
4149                 goto out;
4150 
4151         /* Ensure the device has been registrered */
4152         err = -EINVAL;
4153         if (dev->reg_state != NETREG_REGISTERED)
4154                 goto out;
4155 
4156         /* Get out if there is nothing todo */
4157         err = 0;
4158         if (dev->nd_net == net)
4159                 goto out;
4160 
4161         /* Pick the destination device name, and ensure
4162          * we can use it in the destination network namespace.
4163          */
4164         err = -EEXIST;
4165         destname = dev->name;
4166         if (__dev_get_by_name(net, destname)) {
4167                 /* We get here if we can't use the current device name */
4168                 if (!pat)
4169                         goto out;
4170                 if (!dev_valid_name(pat))
4171                         goto out;
4172                 if (strchr(pat, '%')) {
4173                         if (__dev_alloc_name(net, pat, buf) < 0)
4174                                 goto out;
4175                         destname = buf;
4176                 } else
4177                         destname = pat;
4178                 if (__dev_get_by_name(net, destname))
4179                         goto out;
4180         }
4181 
4182         /*
4183          * And now a mini version of register_netdevice unregister_netdevice.
4184          */
4185 
4186         /* If device is running close it first. */
4187         dev_close(dev);
4188 
4189         /* And unlink it from device chain */
4190         err = -ENODEV;
4191         unlist_netdevice(dev);
4192 
4193         synchronize_net();
4194 
4195         /* Shutdown queueing discipline. */
4196         dev_shutdown(dev);
4197 
4198         /* Notify protocols, that we are about to destroy
4199            this device. They should clean all the things.
4200         */
4201         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4202 
4203         /*
4204          *      Flush the unicast and multicast chains
4205          */
4206         dev_addr_discard(dev);
4207 
4208         /* Actually switch the network namespace */
4209         dev->nd_net = net;
4210 
4211         /* Assign the new device name */
4212         if (destname != dev->name)
4213                 strcpy(dev->name, destname);
4214 
4215         /* If there is an ifindex conflict assign a new one */
4216         if (__dev_get_by_index(net, dev->ifindex)) {
4217                 int iflink = (dev->iflink == dev->ifindex);
4218                 dev->ifindex = dev_new_index(net);
4219                 if (iflink)
4220                         dev->iflink = dev->ifindex;
4221         }
4222 
4223         /* Fixup kobjects */
4224         err = device_rename(&dev->dev, dev->name);
4225         WARN_ON(err);
4226 
4227         /* Add the device back in the hashes */
4228         list_netdevice(dev);
4229 
4230         /* Notify protocols, that a new device appeared. */
4231         call_netdevice_notifiers(NETDEV_REGISTER, dev);
4232 
4233         synchronize_net();
4234         err = 0;
4235 out:
4236         return err;
4237 }
4238 
4239 static int dev_cpu_callback(struct notifier_block *nfb,
4240                             unsigned long action,
4241                             void *ocpu)
4242 {
4243         struct sk_buff **list_skb;
4244         struct net_device **list_net;
4245         struct sk_buff *skb;
4246         unsigned int cpu, oldcpu = (unsigned long)ocpu;
4247         struct softnet_data *sd, *oldsd;
4248 
4249         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4250                 return NOTIFY_OK;
4251 
4252         local_irq_disable();
4253         cpu = smp_processor_id();
4254         sd = &per_cpu(softnet_data, cpu);
4255         oldsd = &per_cpu(softnet_data, oldcpu);
4256 
4257         /* Find end of our completion_queue. */
4258         list_skb = &sd->completion_queue;
4259         while (*list_skb)
4260                 list_skb = &(*list_skb)->next;
4261         /* Append completion queue from offline CPU. */
4262         *list_skb = oldsd->completion_queue;
4263         oldsd->completion_queue = NULL;
4264 
4265         /* Find end of our output_queue. */
4266         list_net = &sd->output_queue;
4267         while (*list_net)
4268                 list_net = &(*list_net)->next_sched;
4269         /* Append output queue from offline CPU. */
4270         *list_net = oldsd->output_queue;
4271         oldsd->output_queue = NULL;
4272 
4273         raise_softirq_irqoff(NET_TX_SOFTIRQ);
4274         local_irq_enable();
4275 
4276         /* Process offline CPU's input_pkt_queue */
4277         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4278                 netif_rx(skb);
4279 
4280         return NOTIFY_OK;
4281 }
4282 
4283 #ifdef CONFIG_NET_DMA
4284 /**
4285  * net_dma_rebalance - try to maintain one DMA channel per CPU
4286  * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4287  *
4288  * This is called when the number of channels allocated to the net_dma client
4289  * changes.  The net_dma client tries to have one DMA channel per CPU.
4290  */
4291 
4292 static void net_dma_rebalance(struct net_dma *net_dma)
4293 {
4294         unsigned int cpu, i, n, chan_idx;
4295         struct dma_chan *chan;
4296 
4297         if (cpus_empty(net_dma->channel_mask)) {
4298                 for_each_online_cpu(cpu)
4299                         rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4300                 return;
4301         }
4302 
4303         i = 0;
4304         cpu = first_cpu(cpu_online_map);
4305 
4306         for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4307                 chan = net_dma->channels[chan_idx];
4308 
4309                 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4310                    + (i < (num_online_cpus() %
4311                         cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4312 
4313                 while(n) {
4314                         per_cpu(softnet_data, cpu).net_dma = chan;
4315                         cpu = next_cpu(cpu, cpu_online_map);
4316                         n--;
4317                 }
4318                 i++;
4319         }
4320 }
4321 
4322 /**
4323  * netdev_dma_event - event callback for the net_dma_client
4324  * @client: should always be net_dma_client
4325  * @chan: DMA channel for the event
4326  * @state: DMA state to be handled
4327  */
4328 static enum dma_state_client
4329 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4330         enum dma_state state)
4331 {
4332         int i, found = 0, pos = -1;
4333         struct net_dma *net_dma =
4334                 container_of(client, struct net_dma, client);
4335         enum dma_state_client ack = DMA_DUP; /* default: take no action */
4336 
4337         spin_lock(&net_dma->lock);
4338         switch (state) {
4339         case DMA_RESOURCE_AVAILABLE:
4340                 for (i = 0; i < NR_CPUS; i++)
4341                         if (net_dma->channels[i] == chan) {
4342                                 found = 1;
4343                                 break;
4344                         } else if (net_dma->channels[i] == NULL && pos < 0)
4345                                 pos = i;
4346 
4347                 if (!found && pos >= 0) {
4348                         ack = DMA_ACK;
4349                         net_dma->channels[pos] = chan;
4350                         cpu_set(pos, net_dma->channel_mask);
4351                         net_dma_rebalance(net_dma);
4352                 }
4353                 break;
4354         case DMA_RESOURCE_REMOVED:
4355                 for (i = 0; i < NR_CPUS; i++)
4356                         if (net_dma->channels[i] == chan) {
4357                                 found = 1;
4358                                 pos = i;
4359                                 break;
4360                         }
4361 
4362                 if (found) {
4363                         ack = DMA_ACK;
4364                         cpu_clear(pos, net_dma->channel_mask);
4365                         net_dma->channels[i] = NULL;
4366                         net_dma_rebalance(net_dma);
4367                 }
4368                 break;
4369         default:
4370                 break;
4371         }
4372         spin_unlock(&net_dma->lock);
4373 
4374         return ack;
4375 }
4376 
4377 /**
4378  * netdev_dma_regiser - register the networking subsystem as a DMA client
4379  */
4380 static int __init netdev_dma_register(void)
4381 {
4382         spin_lock_init(&net_dma.lock);
4383         dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4384         dma_async_client_register(&net_dma.client);
4385         dma_async_client_chan_request(&net_dma.client);
4386         return 0;
4387 }
4388 
4389 #else
4390 static int __init netdev_dma_register(void) { return -ENODEV; }
4391 #endif /* CONFIG_NET_DMA */
4392 
4393 /**
4394  *      netdev_compute_feature - compute conjunction of two feature sets
4395  *      @all: first feature set
4396  *      @one: second feature set
4397  *
4398  *      Computes a new feature set after adding a device with feature set
4399  *      @one to the master device with current feature set @all.  Returns
4400  *      the new feature set.
4401  */
4402 int netdev_compute_features(unsigned long all, unsigned long one)
4403 {
4404         /* if device needs checksumming, downgrade to hw checksumming */
4405         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4406                 all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4407 
4408         /* if device can't do all checksum, downgrade to ipv4/ipv6 */
4409         if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4410                 all ^= NETIF_F_HW_CSUM
4411                         | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4412 
4413         if (one & NETIF_F_GSO)
4414                 one |= NETIF_F_GSO_SOFTWARE;
4415         one |= NETIF_F_GSO;
4416 
4417         /* If even one device supports robust GSO, enable it for all. */
4418         if (one & NETIF_F_GSO_ROBUST)
4419                 all |= NETIF_F_GSO_ROBUST;
4420 
4421         all &= one | NETIF_F_LLTX;
4422 
4423         if (!(all & NETIF_F_ALL_CSUM))
4424                 all &= ~NETIF_F_SG;
4425         if (!(all & NETIF_F_SG))
4426                 all &= ~NETIF_F_GSO_MASK;
4427 
4428         return all;
4429 }
4430 EXPORT_SYMBOL(netdev_compute_features);
4431 
4432 static struct hlist_head *netdev_create_hash(void)
4433 {
4434         int i;
4435         struct hlist_head *hash;
4436 
4437         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4438         if (hash != NULL)
4439                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4440                         INIT_HLIST_HEAD(&hash[i]);
4441 
4442         return hash;
4443 }
4444 
4445 /* Initialize per network namespace state */
4446 static int __net_init netdev_init(struct net *net)
4447 {
4448         INIT_LIST_HEAD(&net->dev_base_head);
4449 
4450         net->dev_name_head = netdev_create_hash();
4451         if (net->dev_name_head == NULL)
4452                 goto err_name;
4453 
4454         net->dev_index_head = netdev_create_hash();
4455         if (net->dev_index_head == NULL)
4456                 goto err_idx;
4457 
4458         return 0;
4459 
4460 err_idx:
4461         kfree(net->dev_name_head);
4462 err_name:
4463         return -ENOMEM;
4464 }
4465 
4466 static void __net_exit netdev_exit(struct net *net)
4467 {
4468         kfree(net->dev_name_head);
4469         kfree(net->dev_index_head);
4470 }
4471 
4472 static struct pernet_operations __net_initdata netdev_net_ops = {
4473         .init = netdev_init,
4474         .exit = netdev_exit,
4475 };
4476 
4477 static void __net_exit default_device_exit(struct net *net)
4478 {
4479         struct net_device *dev, *next;
4480         /*
4481          * Push all migratable of the network devices back to the
4482          * initial network namespace
4483          */
4484         rtnl_lock();
4485         for_each_netdev_safe(net, dev, next) {
4486                 int err;
4487 
4488                 /* Ignore unmoveable devices (i.e. loopback) */
4489                 if (dev->features & NETIF_F_NETNS_LOCAL)
4490                         continue;
4491 
4492                 /* Push remaing network devices to init_net */
4493                 err = dev_change_net_namespace(dev, &init_net, "dev%d");
4494                 if (err) {
4495                         printk(KERN_WARNING "%s: failed to move %s to init_net: %d\n",
4496                                 __func__, dev->name, err);
4497                         unregister_netdevice(dev);
4498                 }
4499         }
4500         rtnl_unlock();
4501 }
4502 
4503 static struct pernet_operations __net_initdata default_device_ops = {
4504         .exit = default_device_exit,
4505 };
4506 
4507 /*
4508  *      Initialize the DEV module. At boot time this walks the device list and
4509  *      unhooks any devices that fail to initialise (normally hardware not
4510  *      present) and leaves us with a valid list of present and active devices.
4511  *
4512  */
4513 
4514 /*
4515  *       This is called single threaded during boot, so no need
4516  *       to take the rtnl semaphore.
4517  */
4518 static int __init net_dev_init(void)
4519 {
4520         int i, rc = -ENOMEM;
4521 
4522         BUG_ON(!dev_boot_phase);
4523 
4524         if (dev_proc_init())
4525                 goto out;
4526 
4527         if (netdev_kobject_init())
4528                 goto out;
4529 
4530         INIT_LIST_HEAD(&ptype_all);
4531         for (i = 0; i < PTYPE_HASH_SIZE; i++)
4532                 INIT_LIST_HEAD(&ptype_base[i]);
4533 
4534         if (register_pernet_subsys(&netdev_net_ops))
4535                 goto out;
4536 
4537         if (register_pernet_device(&default_device_ops))
4538                 goto out;
4539 
4540         /*
4541          *      Initialise the packet receive queues.
4542          */
4543 
4544         for_each_possible_cpu(i) {
4545                 struct softnet_data *queue;
4546 
4547                 queue = &per_cpu(softnet_data, i);
4548                 skb_queue_head_init(&queue->input_pkt_queue);
4549                 queue->completion_queue = NULL;
4550                 INIT_LIST_HEAD(&queue->poll_list);
4551 
4552                 queue->backlog.poll = process_backlog;
4553                 queue->backlog.weight = weight_p;
4554         }
4555 
4556         netdev_dma_register();
4557 
4558         dev_boot_phase = 0;
4559 
4560         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
4561         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
4562 
4563         hotcpu_notifier(dev_cpu_callback, 0);
4564         dst_init();
4565         dev_mcast_init();
4566         rc = 0;
4567 out:
4568         return rc;
4569 }
4570 
4571 subsys_initcall(net_dev_init);
4572 
/* Symbols exported to modules, in alphabetical order. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_get_flags);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);

/* Bridge hooks, exported only when bridging is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
EXPORT_SYMBOL(br_handle_frame_hook);
#endif

#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

EXPORT_PER_CPU_SYMBOL(softnet_data);
4619 
  This page was automatically generated by the LXR engine.