Linux kernel & device driver programming

Cross-Referenced Linux and Device Driver Code

[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]
Version: [ 2.6.11.8 ] [ 2.6.25 ] [ 2.6.25.8 ] [ 2.6.31.13 ] Architecture: [ i386 ]
  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              ROUTE - implementation of the IP router.
  7  *
  8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
  9  *
 10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 15  *
 16  * Fixes:
 17  *              Alan Cox        :       Verify area fixes.
 18  *              Alan Cox        :       cli() protects routing changes
 19  *              Rui Oliveira    :       ICMP routing table updates
 20  *              (rco@di.uminho.pt)      Routing table insertion and update
 21  *              Linus Torvalds  :       Rewrote bits to be sensible
 22  *              Alan Cox        :       Added BSD route gw semantics
 23  *              Alan Cox        :       Super /proc >4K 
 24  *              Alan Cox        :       MTU in route table
 25  *              Alan Cox        :       MSS actually. Also added the window
 26  *                                      clamper.
 27  *              Sam Lantinga    :       Fixed route matching in rt_del()
 28  *              Alan Cox        :       Routing cache support.
 29  *              Alan Cox        :       Removed compatibility cruft.
 30  *              Alan Cox        :       RTF_REJECT support.
 31  *              Alan Cox        :       TCP irtt support.
 32  *              Jonathan Naylor :       Added Metric support.
 33  *      Miquel van Smoorenburg  :       BSD API fixes.
 34  *      Miquel van Smoorenburg  :       Metrics.
 35  *              Alan Cox        :       Use __u32 properly
 36  *              Alan Cox        :       Aligned routing errors more closely with BSD
 37  *                                      our system is still very different.
 38  *              Alan Cox        :       Faster /proc handling
 39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 40  *                                      routing caches and better behaviour.
 41  *              
 42  *              Olaf Erb        :       irtt wasn't being copied right.
 43  *              Bjorn Ekwall    :       Kerneld route support.
 44  *              Alan Cox        :       Multicast fixed (I hope)
 45  *              Pavel Krauz     :       Limited broadcast fixed
 46  *              Mike McLagan    :       Routing by source
 47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 48  *                                      route.c and rewritten from scratch.
 49  *              Andi Kleen      :       Load-limit warning messages.
 50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 54  *              Marc Boucher    :       routing by fwmark
 55  *      Robert Olsson           :       Added rt_cache statistics
 56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 57  *
 58  *              This program is free software; you can redistribute it and/or
 59  *              modify it under the terms of the GNU General Public License
 60  *              as published by the Free Software Foundation; either version
 61  *              2 of the License, or (at your option) any later version.
 62  */
 63 
 64 #include <linux/config.h>
 65 #include <linux/module.h>
 66 #include <asm/uaccess.h>
 67 #include <asm/system.h>
 68 #include <linux/bitops.h>
 69 #include <linux/types.h>
 70 #include <linux/kernel.h>
 71 #include <linux/sched.h>
 72 #include <linux/mm.h>
 73 #include <linux/string.h>
 74 #include <linux/socket.h>
 75 #include <linux/sockios.h>
 76 #include <linux/errno.h>
 77 #include <linux/in.h>
 78 #include <linux/inet.h>
 79 #include <linux/netdevice.h>
 80 #include <linux/proc_fs.h>
 81 #include <linux/init.h>
 82 #include <linux/skbuff.h>
 83 #include <linux/rtnetlink.h>
 84 #include <linux/inetdevice.h>
 85 #include <linux/igmp.h>
 86 #include <linux/pkt_sched.h>
 87 #include <linux/mroute.h>
 88 #include <linux/netfilter_ipv4.h>
 89 #include <linux/random.h>
 90 #include <linux/jhash.h>
 91 #include <linux/rcupdate.h>
 92 #include <linux/times.h>
 93 #include <net/protocol.h>
 94 #include <net/ip.h>
 95 #include <net/route.h>
 96 #include <net/inetpeer.h>
 97 #include <net/sock.h>
 98 #include <net/ip_fib.h>
 99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #ifdef CONFIG_SYSCTL
104 #include <linux/sysctl.h>
105 #endif
106 
107 #define IP_MAX_MTU      0xFFF0
108 
109 #define RT_GC_TIMEOUT (300*HZ)
110 
111 static int ip_rt_min_delay              = 2 * HZ;
112 static int ip_rt_max_delay              = 10 * HZ;
113 static int ip_rt_max_size;
114 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
115 static int ip_rt_gc_interval            = 60 * HZ;
116 static int ip_rt_gc_min_interval        = HZ / 2;
117 static int ip_rt_redirect_number        = 9;
118 static int ip_rt_redirect_load          = HZ / 50;
119 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
120 static int ip_rt_error_cost             = HZ;
121 static int ip_rt_error_burst            = 5 * HZ;
122 static int ip_rt_gc_elasticity          = 8;
123 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
124 static int ip_rt_min_pmtu               = 512 + 20 + 20;
125 static int ip_rt_min_advmss             = 256;
126 static int ip_rt_secret_interval        = 10 * 60 * HZ;
127 static unsigned long rt_deadline;
128 
129 #define RTprint(a...)   printk(KERN_DEBUG a)
130 
131 static struct timer_list rt_flush_timer;
132 static struct timer_list rt_periodic_timer;
133 static struct timer_list rt_secret_timer;
134 
135 /*
136  *      Interface to generic destination cache.
137  */
138 
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static void              ipv4_dst_destroy(struct dst_entry *dst);
141 static void              ipv4_dst_ifdown(struct dst_entry *dst,
142                                          struct net_device *dev, int how);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void              ipv4_link_failure(struct sk_buff *skb);
145 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
146 static int rt_garbage_collect(void);
147 
148 
149 static struct dst_ops ipv4_dst_ops = {
150         .family =               AF_INET,
151         .protocol =             __constant_htons(ETH_P_IP),
152         .gc =                   rt_garbage_collect,
153         .check =                ipv4_dst_check,
154         .destroy =              ipv4_dst_destroy,
155         .ifdown =               ipv4_dst_ifdown,
156         .negative_advice =      ipv4_negative_advice,
157         .link_failure =         ipv4_link_failure,
158         .update_pmtu =          ip_rt_update_pmtu,
159         .entry_size =           sizeof(struct rtable),
160 };
161 
162 #define ECN_OR_COST(class)      TC_PRIO_##class
163 
164 __u8 ip_tos2prio[16] = {
165         TC_PRIO_BESTEFFORT,
166         ECN_OR_COST(FILLER),
167         TC_PRIO_BESTEFFORT,
168         ECN_OR_COST(BESTEFFORT),
169         TC_PRIO_BULK,
170         ECN_OR_COST(BULK),
171         TC_PRIO_BULK,
172         ECN_OR_COST(BULK),
173         TC_PRIO_INTERACTIVE,
174         ECN_OR_COST(INTERACTIVE),
175         TC_PRIO_INTERACTIVE,
176         ECN_OR_COST(INTERACTIVE),
177         TC_PRIO_INTERACTIVE_BULK,
178         ECN_OR_COST(INTERACTIVE_BULK),
179         TC_PRIO_INTERACTIVE_BULK,
180         ECN_OR_COST(INTERACTIVE_BULK)
181 };
182 
183 
184 /*
185  * Route cache.
186  */
187 
 188 /* The locking scheme is rather straightforward:
189  *
190  * 1) Read-Copy Update protects the buckets of the central route hash.
191  * 2) Only writers remove entries, and they hold the lock
192  *    as they look at rtable reference counts.
193  * 3) Only readers acquire references to rtable entries,
194  *    they do so with atomic increments and with the
195  *    lock held.
196  */
197 
198 struct rt_hash_bucket {
199         struct rtable   *chain;
200         spinlock_t      lock;
201 } __attribute__((__aligned__(8)));
202 
203 static struct rt_hash_bucket    *rt_hash_table;
204 static unsigned                 rt_hash_mask;
205 static int                      rt_hash_log;
206 static unsigned int             rt_hash_rnd;
207 
208 struct rt_cache_stat *rt_cache_stat;
209 
210 static int rt_intern_hash(unsigned hash, struct rtable *rth,
211                                 struct rtable **res);
212 
213 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
214 {
215         return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
216                 & rt_hash_mask);
217 }
218 
219 #ifdef CONFIG_PROC_FS
/* Per-open iterator state for walking the route cache through seq_file. */
struct rt_cache_iter_state {
	/* Index of the bucket being walked; goes negative when exhausted. */
	int bucket;
};
223 
224 static struct rtable *rt_cache_get_first(struct seq_file *seq)
225 {
226         struct rtable *r = NULL;
227         struct rt_cache_iter_state *st = seq->private;
228 
229         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
230                 rcu_read_lock_bh();
231                 r = rt_hash_table[st->bucket].chain;
232                 if (r)
233                         break;
234                 rcu_read_unlock_bh();
235         }
236         return r;
237 }
238 
239 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
240 {
241         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
242 
243         r = r->u.rt_next;
244         while (!r) {
245                 rcu_read_unlock_bh();
246                 if (--st->bucket < 0)
247                         break;
248                 rcu_read_lock_bh();
249                 r = rt_hash_table[st->bucket].chain;
250         }
251         return r;
252 }
253 
254 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
255 {
256         struct rtable *r = rt_cache_get_first(seq);
257 
258         if (r)
259                 while (pos && (r = rt_cache_get_next(seq, r)))
260                         --pos;
261         return pos ? NULL : r;
262 }
263 
264 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
265 {
266         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
267 }
268 
269 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
270 {
271         struct rtable *r = NULL;
272 
273         if (v == SEQ_START_TOKEN)
274                 r = rt_cache_get_first(seq);
275         else
276                 r = rt_cache_get_next(seq, v);
277         ++*pos;
278         return r;
279 }
280 
281 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
282 {
283         if (v && v != SEQ_START_TOKEN)
284                 rcu_read_unlock_bh();
285 }
286 
287 static int rt_cache_seq_show(struct seq_file *seq, void *v)
288 {
289         if (v == SEQ_START_TOKEN)
290                 seq_printf(seq, "%-127s\n",
291                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
292                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
293                            "HHUptod\tSpecDst");
294         else {
295                 struct rtable *r = v;
296                 char temp[256];
297 
298                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
299                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
300                         r->u.dst.dev ? r->u.dst.dev->name : "*",
301                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
302                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
303                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
304                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
305                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
306                         dst_metric(&r->u.dst, RTAX_WINDOW),
307                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
308                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
309                         r->fl.fl4_tos,
310                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
311                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
312                                        dev_queue_xmit) : 0,
313                         r->rt_spec_dst);
314                 seq_printf(seq, "%-127s\n", temp);
315         }
316         return 0;
317 }
318 
319 static struct seq_operations rt_cache_seq_ops = {
320         .start  = rt_cache_seq_start,
321         .next   = rt_cache_seq_next,
322         .stop   = rt_cache_seq_stop,
323         .show   = rt_cache_seq_show,
324 };
325 
326 static int rt_cache_seq_open(struct inode *inode, struct file *file)
327 {
328         struct seq_file *seq;
329         int rc = -ENOMEM;
330         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
331 
332         if (!s)
333                 goto out;
334         rc = seq_open(file, &rt_cache_seq_ops);
335         if (rc)
336                 goto out_kfree;
337         seq          = file->private_data;
338         seq->private = s;
339         memset(s, 0, sizeof(*s));
340 out:
341         return rc;
342 out_kfree:
343         kfree(s);
344         goto out;
345 }
346 
347 static struct file_operations rt_cache_seq_fops = {
348         .owner   = THIS_MODULE,
349         .open    = rt_cache_seq_open,
350         .read    = seq_read,
351         .llseek  = seq_lseek,
352         .release = seq_release_private,
353 };
354 
355 
356 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
357 {
358         int cpu;
359 
360         if (*pos == 0)
361                 return SEQ_START_TOKEN;
362 
363         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
364                 if (!cpu_possible(cpu))
365                         continue;
366                 *pos = cpu+1;
367                 return per_cpu_ptr(rt_cache_stat, cpu);
368         }
369         return NULL;
370 }
371 
372 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
373 {
374         int cpu;
375 
376         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
377                 if (!cpu_possible(cpu))
378                         continue;
379                 *pos = cpu+1;
380                 return per_cpu_ptr(rt_cache_stat, cpu);
381         }
382         return NULL;
383         
384 }
385 
/* seq_file ->stop: the per-CPU walk holds no resources, nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
390 
391 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
392 {
393         struct rt_cache_stat *st = v;
394 
395         if (v == SEQ_START_TOKEN) {
396                 seq_printf(seq, "entries  in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
397                 return 0;
398         }
399         
400         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
401                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
402                    atomic_read(&ipv4_dst_ops.entries),
403                    st->in_hit,
404                    st->in_slow_tot,
405                    st->in_slow_mc,
406                    st->in_no_route,
407                    st->in_brd,
408                    st->in_martian_dst,
409                    st->in_martian_src,
410 
411                    st->out_hit,
412                    st->out_slow_tot,
413                    st->out_slow_mc, 
414 
415                    st->gc_total,
416                    st->gc_ignored,
417                    st->gc_goal_miss,
418                    st->gc_dst_overflow,
419                    st->in_hlist_search,
420                    st->out_hlist_search
421                 );
422         return 0;
423 }
424 
425 static struct seq_operations rt_cpu_seq_ops = {
426         .start  = rt_cpu_seq_start,
427         .next   = rt_cpu_seq_next,
428         .stop   = rt_cpu_seq_stop,
429         .show   = rt_cpu_seq_show,
430 };
431 
432 
433 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
434 {
435         return seq_open(file, &rt_cpu_seq_ops);
436 }
437 
438 static struct file_operations rt_cpu_seq_fops = {
439         .owner   = THIS_MODULE,
440         .open    = rt_cpu_seq_open,
441         .read    = seq_read,
442         .llseek  = seq_lseek,
443         .release = seq_release,
444 };
445 
446 #endif /* CONFIG_PROC_FS */
447   
448 static __inline__ void rt_free(struct rtable *rt)
449 {
450         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
451 }
452 
/*
 * rt_drop - put the caller's reference on @rt, then queue the entry for
 * RCU-deferred release.
 */
static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
458 
459 static __inline__ int rt_fast_clean(struct rtable *rth)
460 {
461         /* Kill broadcast/multicast entries very aggresively, if they
462            collide in hash table with more useful entries */
463         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
464                 rth->fl.iif && rth->u.rt_next;
465 }
466 
467 static __inline__ int rt_valuable(struct rtable *rth)
468 {
469         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
470                 rth->u.dst.expires;
471 }
472 
473 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
474 {
475         unsigned long age;
476         int ret = 0;
477 
478         if (atomic_read(&rth->u.dst.__refcnt))
479                 goto out;
480 
481         ret = 1;
482         if (rth->u.dst.expires &&
483             time_after_eq(jiffies, rth->u.dst.expires))
484                 goto out;
485 
486         age = jiffies - rth->u.dst.lastuse;
487         ret = 0;
488         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
489             (age <= tmo2 && rt_valuable(rth)))
490                 goto out;
491         ret = 1;
492 out:    return ret;
493 }
494 
495 /* Bits of score are:
496  * 31: very valuable
497  * 30: not quite useless
498  * 29..0: usage counter
499  */
500 static inline u32 rt_score(struct rtable *rt)
501 {
502         u32 score = jiffies - rt->u.dst.lastuse;
503 
504         score = ~score & ~(3<<30);
505 
506         if (rt_valuable(rt))
507                 score |= (1<<31);
508 
509         if (!rt->fl.iif ||
510             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
511                 score |= (1<<30);
512 
513         return score;
514 }
515 
516 /* This runs via a timer and thus is always in BH context. */
517 static void rt_check_expire(unsigned long dummy)
518 {
519         static int rover;
520         int i = rover, t;
521         struct rtable *rth, **rthp;
522         unsigned long now = jiffies;
523 
524         for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
525              t -= ip_rt_gc_timeout) {
526                 unsigned long tmo = ip_rt_gc_timeout;
527 
528                 i = (i + 1) & rt_hash_mask;
529                 rthp = &rt_hash_table[i].chain;
530 
531                 spin_lock(&rt_hash_table[i].lock);
532                 while ((rth = *rthp) != NULL) {
533                         if (rth->u.dst.expires) {
534                                 /* Entry is expired even if it is in use */
535                                 if (time_before_eq(now, rth->u.dst.expires)) {
536                                         tmo >>= 1;
537                                         rthp = &rth->u.rt_next;
538                                         continue;
539                                 }
540                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
541                                 tmo >>= 1;
542                                 rthp = &rth->u.rt_next;
543                                 continue;
544                         }
545 
546                         /* Cleanup aged off entries. */
547                         *rthp = rth->u.rt_next;
548                         rt_free(rth);
549                 }
550                 spin_unlock(&rt_hash_table[i].lock);
551 
552                 /* Fallback loop breaker. */
553                 if (time_after(jiffies, now))
554                         break;
555         }
556         rover = i;
557         mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
558 }
559 
560 /* This can run from both BH and non-BH contexts, the latter
561  * in the case of a forced flush event.
562  */
563 static void rt_run_flush(unsigned long dummy)
564 {
565         int i;
566         struct rtable *rth, *next;
567 
568         rt_deadline = 0;
569 
570         get_random_bytes(&rt_hash_rnd, 4);
571 
572         for (i = rt_hash_mask; i >= 0; i--) {
573                 spin_lock_bh(&rt_hash_table[i].lock);
574                 rth = rt_hash_table[i].chain;
575                 if (rth)
576                         rt_hash_table[i].chain = NULL;
577                 spin_unlock_bh(&rt_hash_table[i].lock);
578 
579                 for (; rth; rth = next) {
580                         next = rth->u.rt_next;
581                         rt_free(rth);
582                 }
583         }
584 }
585 
586 static DEFINE_SPINLOCK(rt_flush_lock);
587 
588 void rt_cache_flush(int delay)
589 {
590         unsigned long now = jiffies;
591         int user_mode = !in_softirq();
592 
593         if (delay < 0)
594                 delay = ip_rt_min_delay;
595 
596         spin_lock_bh(&rt_flush_lock);
597 
598         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
599                 long tmo = (long)(rt_deadline - now);
600 
601                 /* If flush timer is already running
602                    and flush request is not immediate (delay > 0):
603 
604                    if deadline is not achieved, prolongate timer to "delay",
605                    otherwise fire it at deadline time.
606                  */
607 
608                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
609                         tmo = 0;
610                 
611                 if (delay > tmo)
612                         delay = tmo;
613         }
614 
615         if (delay <= 0) {
616                 spin_unlock_bh(&rt_flush_lock);
617                 rt_run_flush(0);
618                 return;
619         }
620 
621         if (rt_deadline == 0)
622                 rt_deadline = now + ip_rt_max_delay;
623 
624         mod_timer(&rt_flush_timer, now+delay);
625         spin_unlock_bh(&rt_flush_lock);
626 }
627 
628 static void rt_secret_rebuild(unsigned long dummy)
629 {
630         unsigned long now = jiffies;
631 
632         rt_cache_flush(0);
633         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
634 }
635 
636 /*
637    Short description of GC goals.
638 
 639    We want to build an algorithm that keeps the routing cache
 640    at an equilibrium point, where the number of aged-off entries
 641    is kept approximately equal to the number of newly generated ones.
 642 
 643    The current expiration strength is the variable "expire".
 644    We try to adjust it dynamically, so that if networking
 645    is idle, expire is large enough to keep enough warm entries,
 646    and when load increases it shrinks to limit the cache size.
647  */
648 
649 static int rt_garbage_collect(void)
650 {
651         static unsigned long expire = RT_GC_TIMEOUT;
652         static unsigned long last_gc;
653         static int rover;
654         static int equilibrium;
655         struct rtable *rth, **rthp;
656         unsigned long now = jiffies;
657         int goal;
658 
659         /*
660          * Garbage collection is pretty expensive,
661          * do not make it too frequently.
662          */
663 
664         RT_CACHE_STAT_INC(gc_total);
665 
666         if (now - last_gc < ip_rt_gc_min_interval &&
667             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
668                 RT_CACHE_STAT_INC(gc_ignored);
669                 goto out;
670         }
671 
672         /* Calculate number of entries, which we want to expire now. */
673         goal = atomic_read(&ipv4_dst_ops.entries) -
674                 (ip_rt_gc_elasticity << rt_hash_log);
675         if (goal <= 0) {
676                 if (equilibrium < ipv4_dst_ops.gc_thresh)
677                         equilibrium = ipv4_dst_ops.gc_thresh;
678                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
679                 if (goal > 0) {
680                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
681                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
682                 }
683         } else {
684                 /* We are in dangerous area. Try to reduce cache really
685                  * aggressively.
686                  */
687                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
688                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
689         }
690 
691         if (now - last_gc >= ip_rt_gc_min_interval)
692                 last_gc = now;
693 
694         if (goal <= 0) {
695                 equilibrium += goal;
696                 goto work_done;
697         }
698 
699         do {
700                 int i, k;
701 
702                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
703                         unsigned long tmo = expire;
704 
705                         k = (k + 1) & rt_hash_mask;
706                         rthp = &rt_hash_table[k].chain;
707                         spin_lock_bh(&rt_hash_table[k].lock);
708                         while ((rth = *rthp) != NULL) {
709                                 if (!rt_may_expire(rth, tmo, expire)) {
710                                         tmo >>= 1;
711                                         rthp = &rth->u.rt_next;
712                                         continue;
713                                 }
714                                 *rthp = rth->u.rt_next;
715                                 rt_free(rth);
716                                 goal--;
717                         }
718                         spin_unlock_bh(&rt_hash_table[k].lock);
719                         if (goal <= 0)
720                                 break;
721                 }
722                 rover = k;
723 
724                 if (goal <= 0)
725                         goto work_done;
726 
727                 /* Goal is not achieved. We stop process if:
728 
729                    - if expire reduced to zero. Otherwise, expire is halfed.
730                    - if table is not full.
731                    - if we are called from interrupt.
732                    - jiffies check is just fallback/debug loop breaker.
733                      We will not spin here for long time in any case.
734                  */
735 
736                 RT_CACHE_STAT_INC(gc_goal_miss);
737 
738                 if (expire == 0)
739                         break;
740 
741                 expire >>= 1;
742 #if RT_CACHE_DEBUG >= 2
743                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
744                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
745 #endif
746 
747                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
748                         goto out;
749         } while (!in_softirq() && time_before_eq(jiffies, now));
750 
751         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
752                 goto out;
753         if (net_ratelimit())
754                 printk(KERN_WARNING "dst cache overflow\n");
755         RT_CACHE_STAT_INC(gc_dst_overflow);
756         return 1;
757 
758 work_done:
759         expire += ip_rt_gc_min_interval;
760         if (expire > ip_rt_gc_timeout ||
761             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
762                 expire = ip_rt_gc_timeout;
763 #if RT_CACHE_DEBUG >= 2
764         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
765                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
766 #endif
767 out:    return 0;
768 }
769 
770 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
771 {
772         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
773                fl1->oif     == fl2->oif &&
774                fl1->iif     == fl2->iif;
775 }
776 
777 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
778 {
779         struct rtable   *rth, **rthp;
780         unsigned long   now;
781         struct rtable *cand, **candp;
782         u32             min_score;
783         int             chain_length;
784         int attempts = !in_softirq();
785 
786 restart:
787         chain_length = 0;
788         min_score = ~(u32)0;
789         cand = NULL;
790         candp = NULL;
791         now = jiffies;
792 
793         rthp = &rt_hash_table[hash].chain;
794 
795         spin_lock_bh(&rt_hash_table[hash].lock);
796         while ((rth = *rthp) != NULL) {
797                 if (compare_keys(&rth->fl, &rt->fl)) {
798                         /* Put it first */
799                         *rthp = rth->u.rt_next;
800                         /*
801                          * Since lookup is lockfree, the deletion
802                          * must be visible to another weakly ordered CPU before
803                          * the insertion at the start of the hash chain.
804                          */
805                         rcu_assign_pointer(rth->u.rt_next,
806                                            rt_hash_table[hash].chain);
807                         /*
808                          * Since lookup is lockfree, the update writes
809                          * must be ordered for consistency on SMP.
810                          */
811                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
812 
813                         rth->u.dst.__use++;
814                         dst_hold(&rth->u.dst);
815                         rth->u.dst.lastuse = now;
816                         spin_unlock_bh(&rt_hash_table[hash].lock);
817 
818                         rt_drop(rt);
819                         *rp = rth;
820                         return 0;
821                 }
822 
823                 if (!atomic_read(&rth->u.dst.__refcnt)) {
824                         u32 score = rt_score(rth);
825 
826                         if (score <= min_score) {
827                                 cand = rth;
828                                 candp = rthp;
829                                 min_score = score;
830                         }
831                 }
832 
833                 chain_length++;
834 
835                 rthp = &rth->u.rt_next;
836         }
837 
838         if (cand) {
839                 /* ip_rt_gc_elasticity used to be average length of chain
840                  * length, when exceeded gc becomes really aggressive.
841                  *
842                  * The second limit is less certain. At the moment it allows
843                  * only 2 entries per bucket. We will see.
844                  */
845                 if (chain_length > ip_rt_gc_elasticity) {
846                         *candp = cand->u.rt_next;
847                         rt_free(cand);
848                 }
849         }
850 
851         /* Try to bind route to arp only if it is output
852            route or unicast forwarding path.
853          */
854         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
855                 int err = arp_bind_neighbour(&rt->u.dst);
856                 if (err) {
857                         spin_unlock_bh(&rt_hash_table[hash].lock);
858 
859                         if (err != -ENOBUFS) {
860                                 rt_drop(rt);
861                                 return err;
862                         }
863 
864                         /* Neighbour tables are full and nothing
865                            can be released. Try to shrink route cache,
866                            it is most likely it holds some neighbour records.
867                          */
868                         if (attempts-- > 0) {
869                                 int saved_elasticity = ip_rt_gc_elasticity;
870                                 int saved_int = ip_rt_gc_min_interval;
871                                 ip_rt_gc_elasticity     = 1;
872                                 ip_rt_gc_min_interval   = 0;
873                                 rt_garbage_collect();
874                                 ip_rt_gc_min_interval   = saved_int;
875                                 ip_rt_gc_elasticity     = saved_elasticity;
876                                 goto restart;
877                         }
878 
879                         if (net_ratelimit())
880                                 printk(KERN_WARNING "Neighbour table overflow.\n");
881                         rt_drop(rt);
882                         return -ENOBUFS;
883                 }
884         }
885 
886         rt->u.rt_next = rt_hash_table[hash].chain;
887 #if RT_CACHE_DEBUG >= 2
888         if (rt->u.rt_next) {
889                 struct rtable *trt;
890                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
891                        NIPQUAD(rt->rt_dst));
892                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
893                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
894                 printk("\n");
895         }
896 #endif
897         rt_hash_table[hash].chain = rt;
898         spin_unlock_bh(&rt_hash_table[hash].lock);
899         *rp = rt;
900         return 0;
901 }
902 
903 void rt_bind_peer(struct rtable *rt, int create)
904 {
905         static DEFINE_SPINLOCK(rt_peer_lock);
906         struct inet_peer *peer;
907 
908         peer = inet_getpeer(rt->rt_dst, create);
909 
910         spin_lock_bh(&rt_peer_lock);
911         if (rt->peer == NULL) {
912                 rt->peer = peer;
913                 peer = NULL;
914         }
915         spin_unlock_bh(&rt_peer_lock);
916         if (peer)
917                 inet_putpeer(peer);
918 }
919 
920 /*
921  * Peer allocation may fail only in serious out-of-memory conditions.  However
922  * we still can generate some output.
923  * Random ID selection looks a bit dangerous because we have no chances to
924  * select ID being unique in a reasonable period of time.
925  * But broken packet identifier may be better than no packet at all.
926  */
927 static void ip_select_fb_ident(struct iphdr *iph)
928 {
929         static DEFINE_SPINLOCK(ip_fb_id_lock);
930         static u32 ip_fallback_id;
931         u32 salt;
932 
933         spin_lock_bh(&ip_fb_id_lock);
934         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
935         iph->id = htons(salt & 0xFFFF);
936         ip_fallback_id = salt;
937         spin_unlock_bh(&ip_fb_id_lock);
938 }
939 
940 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
941 {
942         struct rtable *rt = (struct rtable *) dst;
943 
944         if (rt) {
945                 if (rt->peer == NULL)
946                         rt_bind_peer(rt, 1);
947 
948                 /* If peer is attached to destination, it is never detached,
949                    so that we need not to grab a lock to dereference it.
950                  */
951                 if (rt->peer) {
952                         iph->id = htons(inet_getid(rt->peer, more));
953                         return;
954                 }
955         } else
956                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
957 
958         ip_select_fb_ident(iph);
959 }
960 
961 static void rt_del(unsigned hash, struct rtable *rt)
962 {
963         struct rtable **rthp;
964 
965         spin_lock_bh(&rt_hash_table[hash].lock);
966         ip_rt_put(rt);
967         for (rthp = &rt_hash_table[hash].chain; *rthp;
968              rthp = &(*rthp)->u.rt_next)
969                 if (*rthp == rt) {
970                         *rthp = rt->u.rt_next;
971                         rt_free(rt);
972                         break;
973                 }
974         spin_unlock_bh(&rt_hash_table[hash].lock);
975 }
976 
/*
 * Apply a redirect that advises new_gw instead of old_gw for traffic from
 * saddr to daddr, as seen on dev.
 * NOTE(review): presumably called when an ICMP redirect is received --
 * confirm against the caller.
 *
 * The redirect is rejected (and optionally logged) when the new gateway
 * equals the old one, when IN_DEV_RX_REDIRECTS() is false for the device,
 * when new_gw is a multicast/bad-class/zeronet address, or when the
 * on-link / unicast checks below fail.
 *
 * Otherwise the route cache is probed with every combination of source
 * key {saddr, 0} and output interface key {dev->ifindex, 0}.  A matching
 * output entry (fl.iif == 0) that currently goes through old_gw on dev is
 * cloned, the clone is pointed at new_gw and marked RTCF_REDIRECTED, and,
 * provided a NUD_VALID neighbour can be bound to it, the old entry is
 * removed with rt_del() and the clone inserted with rt_intern_hash().
 */
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        /* i selects the source key, k the output interface key. */
        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                /* Flow key does not match: try the next entry. */
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                /*
                                 * Key matches but the entry is not the route
                                 * the redirect talks about (or carries an
                                 * error): stop scanning this chain.  The
                                 * rcu_read_unlock() after the loop runs.
                                 */
                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                /*
                                 * Take a reference before leaving the RCU
                                 * read section; rth is still used below.
                                 */
                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                /*
                                 * The structure copy duplicated pointers and
                                 * counters; reset the per-entry state and
                                 * take references for the copied dev/idev.
                                 */
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                /* The copied peer pointer needs its own reference. */
                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                /*
                                 * Give up on this entry if no neighbour can
                                 * be bound or the bound one is not NUD_VALID.
                                 * A neighbour that exists but is not valid
                                 * gets neigh_event_send() before the clone
                                 * is dropped.
                                 */
                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                /*
                                 * rt_del() also drops the reference taken
                                 * with dst_hold() above.  On success
                                 * rt_intern_hash() hands a route back in rt;
                                 * it is not needed here, so release it.
                                 */
                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}
1106 
1107 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1108 {
1109         struct rtable *rt = (struct rtable*)dst;
1110         struct dst_entry *ret = dst;
1111 
1112         if (rt) {
1113                 if (dst->obsolete) {
1114                         ip_rt_put(rt);
1115                         ret = NULL;
1116                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1117                            rt->u.dst.expires) {
1118                         unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1119                                                      rt->fl.fl4_src ^
1120                                                         (rt->fl.oif << 5),
1121                                                      rt->fl.fl4_tos);
1122 #if RT_CACHE_DEBUG >= 1
1123                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1124                                           "%u.%u.%u.%u/%02x dropped\n",
1125                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1126 #endif
1127                         rt_del(hash, rt);
1128                         ret = NULL;
1129                 }
1130         }
1131         return ret;
1132 }
1133 
1134 /*
1135  * Algorithm:
1136  *      1. The first ip_rt_redirect_number redirects are sent
1137  *         with exponential backoff, then we stop sending them at all,
1138  *         assuming that the host ignores our redirects.
1139  *      2. If we did not see packets requiring redirects
1140  *         during ip_rt_redirect_silence, we assume that the host
1141  *         forgot redirected route and start to send redirects again.
1142  *
1143  * This algorithm is much cheaper and more intelligent than dumb load limiting
1144  * in icmp.c.
1145  *
1146  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1147  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1148  */
1149 
1150 void ip_rt_send_redirect(struct sk_buff *skb)
1151 {
1152         struct rtable *rt = (struct rtable*)skb->dst;
1153         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1154 
1155         if (!in_dev)
1156                 return;
1157 
1158         if (!IN_DEV_TX_REDIRECTS(in_dev))
1159                 goto out;
1160 
1161         /* No redirected packets during ip_rt_redirect_silence;
1162          * reset the algorithm.
1163          */
1164         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1165                 rt->u.dst.rate_tokens = 0;
1166 
1167         /* Too many ignored redirects; do not send anything
1168          * set u.dst.rate_last to the last seen redirected packet.
1169          */
1170         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1171                 rt->u.dst.rate_last = jiffies;
1172                 goto out;
1173         }
1174 
1175         /* Check for load limit; set rate_last to the latest sent
1176          * redirect.
1177          */
1178         if (time_after(jiffies,
1179                        (rt->u.dst.rate_last +
1180                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1181                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1182                 rt->u.dst.rate_last = jiffies;
1183                 ++rt->u.dst.rate_tokens;
1184 #ifdef CONFIG_IP_ROUTE_VERBOSE
1185                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1186                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1187                     net_ratelimit())
1188                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1189                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1190                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1191                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1192 #endif
1193         }
1194 out:
1195         in_dev_put(in_dev);
1196 }
1197 
1198 static int ip_error(struct sk_buff *skb)
1199 {
1200         struct rtable *rt = (struct rtable*)skb->dst;
1201         unsigned long now;
1202         int code;
1203 
1204         switch (rt->u.dst.error) {
1205                 case EINVAL:
1206                 default:
1207                         goto out;
1208                 case EHOSTUNREACH:
1209                         code = ICMP_HOST_UNREACH;
1210                         break;
1211                 case ENETUNREACH:
1212                         code = ICMP_NET_UNREACH;
1213                         break;
1214                 case EACCES:
1215                         code = ICMP_PKT_FILTERED;
1216                         break;
1217         }
1218 
1219         now = jiffies;
1220         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1221         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1222                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1223         rt->u.dst.rate_last = now;
1224         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1225                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1226                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1227         }
1228 
1229 out:    kfree_skb(skb);
1230         return 0;
1231 } 
1232 
/*
 *      MTU plateau table, in strictly descending order.
 *
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 *      (NOTE(review): "the RFC" is presumably RFC 1191 -- confirm.)
 *
 *      The table is only ever read, so it is declared const.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        /* Unsigned index: ARRAY_SIZE() yields an unsigned count. */
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
1250 
1251 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1252 {
1253         int i;
1254         unsigned short old_mtu = ntohs(iph->tot_len);
1255         struct rtable *rth;
1256         u32  skeys[2] = { iph->saddr, 0, };
1257         u32  daddr = iph->daddr;
1258         u8   tos = iph->tos & IPTOS_RT_MASK;
1259         unsigned short est_mtu = 0;
1260 
1261         if (ipv4_config.no_pmtu_disc)
1262                 return 0;
1263 
1264         for (i = 0; i < 2; i++) {
1265                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1266 
1267                 rcu_read_lock();
1268                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1269                      rth = rcu_dereference(rth->u.rt_next)) {
1270                         if (rth->fl.fl4_dst == daddr &&
1271                             rth->fl.fl4_src == skeys[i] &&
1272                             rth->rt_dst  == daddr &&
1273                             rth->rt_src  == iph->saddr &&
1274                             rth->fl.fl4_tos == tos &&
1275                             rth->fl.iif == 0 &&
1276                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1277                                 unsigned short mtu = new_mtu;
1278 
1279                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1280 
1281                                         /* BSD 4.2 compatibility hack :-( */
1282                                         if (mtu == 0 &&
1283                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1284                                             old_mtu >= 68 + (iph->ihl << 2))
1285                                                 old_mtu -= iph->ihl << 2;
1286 
1287                                         mtu = guess_mtu(old_mtu);
1288                                 }
1289                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1290                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1291                                                 dst_confirm(&rth->u.dst);
1292                                                 if (mtu < ip_rt_min_pmtu) {
1293                                                         mtu = ip_rt_min_pmtu;
1294                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1295                                                                 (1 << RTAX_MTU);
1296                                                 }
1297                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1298                                                 dst_set_expires(&rth->u.dst,
1299                                                         ip_rt_mtu_expires);
1300                                         }
1301                                         est_mtu = mtu;
1302                                 }
1303                         }
1304                 }
1305                 rcu_read_unlock();
1306         }
1307         return est_mtu ? : new_mtu;
1308 }
1309 
1310 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1311 {
1312         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1313             !(dst_metric_locked(dst, RTAX_MTU))) {
1314                 if (mtu < ip_rt_min_pmtu) {
1315                         mtu = ip_rt_min_pmtu;
1316                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1317                 }
1318                 dst->metrics[RTAX_MTU-1] = mtu;
1319                 dst_set_expires(dst, ip_rt_mtu_expires);
1320         }
1321 }
1322 
/*
 * Unconditionally give up the entry: the reference on @dst is released
 * with dst_release() and NULL is returned.  @cookie is not used.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        dst_release(dst);
        return NULL;
}
1328 
1329 static void ipv4_dst_destroy(struct dst_entry *dst)
1330 {
1331         struct rtable *rt = (struct rtable *) dst;
1332         struct inet_peer *peer = rt->peer;
1333         struct in_device *idev = rt->idev;
1334 
1335         if (peer) {
1336                 rt->peer = NULL;
1337                 inet_putpeer(peer);
1338         }
1339 
1340         if (idev) {
1341                 rt->idev = NULL;
1342                 in_dev_put(idev);
1343         }
1344 }
1345 
1346 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1347                             int how)
1348 {
1349         struct rtable *rt = (struct rtable *) dst;
1350         struct in_device *idev = rt->idev;
1351         if (dev != &loopback_dev && idev && idev->dev == dev) {
1352                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1353                 if (loopback_idev) {
1354                         rt->idev = loopback_idev;
1355                         in_dev_put(idev);
1356                 }
1357         }
1358 }
1359 
1360 static void ipv4_link_failure(struct sk_buff *skb)
1361 {
1362         struct rtable *rt;
1363 
1364         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1365 
1366         rt = (struct rtable *) skb->dst;
1367         if (rt)
1368                 dst_set_expires(&rt->u.dst, 0);
1369 }
1370 
1371 static int ip_rt_bug(struct sk_buff *skb)
1372 {
1373         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1374                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1375                 skb->dev ? skb->dev->name : "?");
1376         kfree_skb(skb);
1377         return 0;
1378 }
1379 
1380 /*
1381    We do not cache source address of outgoing interface,
1382    because it is used only by IP RR, TS and SRR options,
1383    so that it out of fast path.
1384 
1385    BTW remember: "addr" is allowed to be not aligned
1386    in IP options!
1387  */
1388 
1389 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1390 {
1391         u32 src;
1392         struct fib_result res;
1393 
1394         if (rt->fl.iif == 0)
1395                 src = rt->rt_src;
1396         else if (fib_lookup(&rt->fl, &res) == 0) {
1397                 src = FIB_RES_PREFSRC(res);
1398                 fib_res_put(&res);
1399         } else
1400                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1401                                         RT_SCOPE_UNIVERSE);
1402         memcpy(addr, &src, 4);
1403 }
1404 
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time: a half
 * of tclassid that is still zero takes the corresponding half of @tag, a
 * half that is already set is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        struct dst_entry *dst = &rt->u.dst;

        if ((dst->tclassid & 0xFFFF) == 0)
                dst->tclassid |= tag & 0xFFFF;

        if ((dst->tclassid & 0xFFFF0000) == 0)
                dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1414 
1415 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1416 {
1417         struct fib_info *fi = res->fi;
1418 
1419         if (fi) {
1420                 if (FIB_RES_GW(*res) &&
1421                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1422                         rt->rt_gateway = FIB_RES_GW(*res);
1423                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1424                        sizeof(rt->u.dst.metrics));
1425                 if (fi->fib_mtu == 0) {
1426                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1427                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1428                             rt->rt_gateway != rt->rt_dst &&
1429                             rt->u.dst.dev->mtu > 576)
1430                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1431                 }
1432 #ifdef CONFIG_NET_CLS_ROUTE
1433                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1434 #endif
1435         } else
1436                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1437 
1438         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1439                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1440         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1441                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1442         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1443                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1444                                        ip_rt_min_advmss);
1445         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1446                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1447 
1448 #ifdef CONFIG_NET_CLS_ROUTE
1449 #ifdef CONFIG_IP_MULTIPLE_TABLES
1450         set_class_tag(rt, fib_rules_tclass(res));
1451 #endif
1452         set_class_tag(rt, itag);
1453 #endif
1454         rt->rt_type = res->type;
1455 }
1456 
1457 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1458                                 u8 tos, struct net_device *dev, int our)
1459 {
1460         unsigned hash;
1461         struct rtable *rth;
1462         u32 spec_dst;
1463         struct in_device *in_dev = in_dev_get(dev);
1464         u32 itag = 0;
1465 
1466         /* Primary sanity checks. */
1467 
1468         if (in_dev == NULL)
1469                 return -EINVAL;
1470 
1471         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1472             skb->protocol != htons(ETH_P_IP))
1473                 goto e_inval;
1474 
1475         if (ZERONET(saddr)) {
1476                 if (!LOCAL_MCAST(daddr))
1477                         goto e_inval;
1478                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1479         } else if (fib_validate_source(saddr, 0, tos, 0,
1480                                         dev, &spec_dst, &itag) < 0)
1481                 goto e_inval;
1482 
1483         rth = dst_alloc(&ipv4_dst_ops);
1484         if (!rth)
1485                 goto e_nobufs;
1486 
1487         rth->u.dst.output= ip_rt_bug;
1488 
1489         atomic_set(&rth->u.dst.__refcnt, 1);
1490         rth->u.dst.flags= DST_HOST;
1491         if (in_dev->cnf.no_policy)
1492                 rth->u.dst.flags |= DST_NOPOLICY;
1493         rth->fl.fl4_dst = daddr;
1494         rth->rt_dst     = daddr;
1495         rth->fl.fl4_tos = tos;
1496 #ifdef CONFIG_IP_ROUTE_FWMARK
1497         rth->fl.fl4_fwmark= skb->nfmark;
1498 #endif
1499         rth->fl.fl4_src = saddr;
1500         rth->rt_src     = saddr;
1501 #ifdef CONFIG_NET_CLS_ROUTE
1502         rth->u.dst.tclassid = itag;
1503 #endif
1504         rth->rt_iif     =
1505         rth->fl.iif     = dev->ifindex;
1506         rth->u.dst.dev  = &loopback_dev;
1507         dev_hold(rth->u.dst.dev);
1508         rth->idev       = in_dev_get(rth->u.dst.dev);
1509         rth->fl.oif     = 0;
1510         rth->rt_gateway = daddr;
1511         rth->rt_spec_dst= spec_dst;
1512         rth->rt_type    = RTN_MULTICAST;
1513         rth->rt_flags   = RTCF_MULTICAST;
1514         if (our) {
1515                 rth->u.dst.input= ip_local_deliver;
1516                 rth->rt_flags |= RTCF_LOCAL;
1517         }
1518 
1519 #ifdef CONFIG_IP_MROUTE
1520         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1521                 rth->u.dst.input = ip_mr_input;
1522 #endif
1523         RT_CACHE_STAT_INC(in_slow_mc);
1524 
1525         in_dev_put(in_dev);
1526         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1527         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1528 
1529 e_nobufs:
1530         in_dev_put(in_dev);
1531         return -ENOBUFS;
1532 
1533 e_inval:
1534         in_dev_put(in_dev);
1535         return -EINVAL;
1536 }
1537 
/*
 *      NOTE. We drop all packets that have local source
 *      addresses, because every properly looped-back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1547 
1548 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1549                         u8 tos, struct net_device *dev)
1550 {
1551         struct fib_result res;
1552         struct in_device *in_dev = in_dev_get(dev);
1553         struct in_device *out_dev = NULL;
1554         struct flowi fl = { .nl_u = { .ip4_u =
1555                                       { .daddr = daddr,
1556                                         .saddr = saddr,
1557                                         .tos = tos,
1558                                         .scope = RT_SCOPE_UNIVERSE,
1559 #ifdef CONFIG_IP_ROUTE_FWMARK
1560                                         .fwmark = skb->nfmark
1561 #endif
1562                                       } },
1563                             .iif = dev->ifindex };
1564         unsigned        flags = 0;
1565         u32             itag = 0;
1566         struct rtable * rth;
1567         unsigned        hash;
1568         u32             spec_dst;
1569         int             err = -EINVAL;
1570         int             free_res = 0;
1571 
1572         /* IP on this device is disabled. */
1573 
1574         if (!in_dev)
1575                 goto out;
1576 
1577         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1578 
1579         /* Check for the most weird martians, which can be not detected
1580            by fib_lookup.
1581          */
1582 
1583         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1584                 goto martian_source;
1585 
1586         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1587                 goto brd_input;
1588 
1589         /* Accept zero addresses only to limited broadcast;
1590          * I even do not know to fix it or not. Waiting for complains :-)
1591          */
1592         if (ZERONET(saddr))
1593                 goto martian_source;
1594 
1595         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1596                 goto martian_destination;
1597 
1598         /*
1599          *      Now we are ready to route packet.
1600          */
1601         if ((err = fib_lookup(&fl, &res)) != 0) {
1602                 if (!IN_DEV_FORWARD(in_dev))
1603                         goto e_inval;
1604                 goto no_route;
1605         }
1606         free_res = 1;
1607 
1608         RT_CACHE_STAT_INC(in_slow_tot);
1609 
1610         if (res.type == RTN_BROADCAST)
1611                 goto brd_input;
1612 
1613         if (res.type == RTN_LOCAL) {
1614                 int result;
1615                 result = fib_validate_source(saddr, daddr, tos,
1616                                              loopback_dev.ifindex,
1617                                              dev, &spec_dst, &itag);
1618                 if (result < 0)
1619                         goto martian_source;
1620                 if (result)
1621                         flags |= RTCF_DIRECTSRC;
1622                 spec_dst = daddr;
1623                 goto local_input;
1624         }
1625 
1626         if (!IN_DEV_FORWARD(in_dev))
1627                 goto e_inval;
1628         if (res.type != RTN_UNICAST)
1629                 goto martian_destination;
1630 
1631 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1632         if (res.fi->fib_nhs > 1 && fl.oif == 0)
1633                 fib_select_multipath(&fl, &res);
1634 #endif
1635         out_dev = in_dev_get(FIB_RES_DEV(res));
1636         if (out_dev == NULL) {
1637                 if (net_ratelimit())
1638                         printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1639                                          "Please, report\n");
1640                 goto e_inval;
1641         }
1642 
1643         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1644                                   &spec_dst, &itag);
1645         if (err < 0)
1646                 goto martian_source;
1647 
1648         if (err)
1649                 flags |= RTCF_DIRECTSRC;
1650 
1651         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1652             (IN_DEV_SHARED_MEDIA(out_dev) ||
1653              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1654                 flags |= RTCF_DOREDIRECT;
1655 
1656         if (skb->protocol != htons(ETH_P_IP)) {
1657                 /* Not IP (i.e. ARP). Do not create route, if it is
1658                  * invalid for proxy arp. DNAT routes are always valid.
1659                  */
1660                 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1661                         goto e_inval;
1662         }
1663 
1664         rth = dst_alloc(&ipv4_dst_ops);
1665         if (!rth)
1666                 goto e_nobufs;
1667 
1668         atomic_set(&rth->u.dst.__refcnt, 1);
1669         rth->u.dst.flags= DST_HOST;
1670         if (in_dev->cnf.no_policy)
1671                 rth->u.dst.flags |= DST_NOPOLICY;
1672         if (in_dev->cnf.no_xfrm)
1673                 rth->u.dst.flags |= DST_NOXFRM;
1674         rth->fl.fl4_dst = daddr;
1675         rth->rt_dst     = daddr;
1676         rth->fl.fl4_tos = tos;
1677 #ifdef CONFIG_IP_ROUTE_FWMARK
1678         rth->fl.fl4_fwmark= skb->nfmark;
1679 #endif
1680         rth->fl.fl4_src = saddr;
1681         rth->rt_src     = saddr;
1682         rth->rt_gateway = daddr;
1683         rth->rt_iif     =
1684         rth->fl.iif     = dev->ifindex;
1685         rth->u.dst.dev  = out_dev->dev;
1686         dev_hold(rth->u.dst.dev);
1687         rth->idev       = in_dev_get(rth->u.dst.dev);
1688         rth->fl.oif     = 0;
1689         rth->rt_spec_dst= spec_dst;
1690 
1691         rth->u.dst.input = ip_forward;
1692         rth->u.dst.output = ip_output;
1693 
1694         rt_set_nexthop(rth, &res, itag);
1695 
1696         rth->rt_flags = flags;
1697 
1698 intern:
1699         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1700 done:
1701         in_dev_put(in_dev);
1702         if (out_dev)
1703                 in_dev_put(out_dev);
1704         if (free_res)
1705                 fib_res_put(&res);
1706 out:    return err;
1707 
1708 brd_input:
1709         if (skb->protocol != htons(ETH_P_IP))
1710                 goto e_inval;
1711 
1712         if (ZERONET(saddr))
1713                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1714         else {
1715                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1716                                           &itag);
1717                 if (err < 0)
1718                         goto martian_source;
1719                 if (err)
1720                         flags |= RTCF_DIRECTSRC;
1721         }
1722         flags |= RTCF_BROADCAST;
1723         res.type = RTN_BROADCAST;
1724         RT_CACHE_STAT_INC(in_brd);
1725 
1726 local_input:
1727         rth = dst_alloc(&ipv4_dst_ops);
1728         if (!rth)
1729                 goto e_nobufs;
1730 
1731         rth->u.dst.output= ip_rt_bug;
1732 
1733         atomic_set(&rth->u.dst.__refcnt, 1);
1734         rth->u.dst.flags= DST_HOST;
1735         if (in_dev->cnf.no_policy)
1736                 rth->u.dst.flags |= DST_NOPOLICY;
1737         rth->fl.fl4_dst = daddr;
1738         rth->rt_dst     = daddr;
1739         rth->fl.fl4_tos = tos;
1740 #ifdef CONFIG_IP_ROUTE_FWMARK
1741         rth->fl.fl4_fwmark= skb->nfmark;
1742 #endif
1743         rth->fl.fl4_src = saddr;
1744         rth->rt_src     = saddr;
1745 #ifdef CONFIG_NET_CLS_ROUTE
1746         rth->u.dst.tclassid = itag;
1747 #endif
1748         rth->rt_iif     =
1749         rth->fl.iif     = dev->ifindex;
1750         rth->u.dst.dev  = &loopback_dev;
1751         dev_hold(rth->u.dst.dev);
1752         rth->idev       = in_dev_get(rth->u.dst.dev);
1753         rth->rt_gateway = daddr;
1754         rth->rt_spec_dst= spec_dst;
1755         rth->u.dst.input= ip_local_deliver;
1756         rth->rt_flags   = flags|RTCF_LOCAL;
1757         if (res.type == RTN_UNREACHABLE) {
1758                 rth->u.dst.input= ip_error;
1759                 rth->u.dst.error= -err;
1760                 rth->rt_flags   &= ~RTCF_LOCAL;
1761         }
1762         rth->rt_type    = res.type;
1763         goto intern;
1764 
1765 no_route:
1766         RT_CACHE_STAT_INC(in_no_route);
1767         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1768         res.type = RTN_UNREACHABLE;
1769         goto local_input;
1770 
1771         /*
1772          *      Do not cache martian addresses: they should be logged (RFC1812)
1773          */
1774 martian_destination:
1775         RT_CACHE_STAT_INC(in_martian_dst);
1776 #ifdef CONFIG_IP_ROUTE_VERBOSE
1777         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1778                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1779                         "%u.%u.%u.%u, dev %s\n",
1780                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1781 #endif
1782 e_inval:
1783         err = -EINVAL;
1784         goto done;
1785 
1786 e_nobufs:
1787         err = -ENOBUFS;
1788         goto done;
1789 
1790 martian_source:
1791 
1792         RT_CACHE_STAT_INC(in_martian_src);
1793 #ifdef CONFIG_IP_ROUTE_VERBOSE
1794         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1795                 /*
1796                  *      RFC1812 recommendation, if source is martian,
1797                  *      the only hint is MAC header.
1798                  */
1799                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1800                         "%u.%u.%u.%u, on dev %s\n",
1801                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1802                 if (dev->hard_header_len) {
1803                         int i;
1804                         unsigned char *p = skb->mac.raw;
1805                         printk(KERN_WARNING "ll header: ");
1806                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1807                                 printk("%02x", *p);
1808                                 if (i < (dev->hard_header_len - 1))
1809                                         printk(":");
1810                         }
1811                         printk("\n");
1812                 }
1813         }
1814 #endif
1815         goto e_inval;
1816 }
1817 
1818 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1819                    u8 tos, struct net_device *dev)
1820 {
             /*
              * Attach an input route for (daddr, saddr, tos, dev) to skb->dst.
              *
              * The route cache chain selected by rt_hash_code() is searched
              * first.  On a hit the entry is referenced with dst_hold(), its
              * lastuse/__use counters are updated, it is stored in skb->dst
              * and 0 is returned.
              *
              * On a miss, multicast destinations are decided here (either
              * ip_route_input_mc() is called or -EINVAL is returned); every
              * other destination goes to ip_route_input_slow().
              */
1821         struct rtable * rth;
1822         unsigned        hash;
1823         int iif = dev->ifindex;
1824 
             /* Only the IPTOS_RT_MASK bits of tos take part in the lookup. */
1825         tos &= IPTOS_RT_MASK;
1826         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1827 
1828         rcu_read_lock();
1829         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1830              rth = rcu_dereference(rth->u.rt_next)) {
                     /* Match on the full flow key; oif must be 0 here. */
1831                 if (rth->fl.fl4_dst == daddr &&
1832                     rth->fl.fl4_src == saddr &&
1833                     rth->fl.iif == iif &&
1834                     rth->fl.oif == 0 &&
1835 #ifdef CONFIG_IP_ROUTE_FWMARK
1836                     rth->fl.fl4_fwmark == skb->nfmark &&
1837 #endif
1838                     rth->fl.fl4_tos == tos) {
1839                         rth->u.dst.lastuse = jiffies;
                             /* Reference is taken before the RCU section ends. */
1840                         dst_hold(&rth->u.dst);
1841                         rth->u.dst.__use++;
1842                         RT_CACHE_STAT_INC(in_hit);
1843                         rcu_read_unlock();
1844                         skb->dst = (struct dst_entry*)rth;
1845                         return 0;
1846                 }
1847                 RT_CACHE_STAT_INC(in_hlist_search);
1848         }
1849         rcu_read_unlock();
1850 
1851         /* Multicast recognition logic is moved from route cache to here.
1852            The problem was that too many Ethernet cards have broken/missing
1853            hardware multicast filters :-( As result the host on multicasting
1854            network acquires a lot of useless route cache entries, sort of
1855            SDR messages from all the world. Now we try to get rid of them.
1856            Really, provided software IP multicast filter is organized
1857            reasonably (at least, hashed), it does not result in a slowdown
1858            comparing with route cache reject entries.
1859            Note, that multicast routers are not affected, because
1860            route cache entry is created eventually.
1861          */
1862         if (MULTICAST(daddr)) {
1863                 struct in_device *in_dev;
1864 
1865                 rcu_read_lock();
1866                 if ((in_dev = __in_dev_get(dev)) != NULL) {
                             /* Note: this reads the IP header through skb->nh.iph,
                                so the caller must have set that pointer. */
1867                         int our = ip_check_mc(in_dev, daddr, saddr,
1868                                 skb->nh.iph->protocol);
1869                         if (our
1870 #ifdef CONFIG_IP_MROUTE
1871                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1872 #endif
1873                             ) {
1874                                 rcu_read_unlock();
1875                                 return ip_route_input_mc(skb, daddr, saddr,
1876                                                          tos, dev, our);
1877                         }
1878                 }
1879                 rcu_read_unlock();
                     /* Multicast that is neither ours nor to be forwarded. */
1880                 return -EINVAL;
1881         }
1882         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1883 }
1884 
1885 /*
1886  * Major route resolver routine.
1887  */
1888 
1889 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1890 {
             /*
              * Slow path of output route resolution; __ip_route_output_key()
              * calls it when its cache lookup finds nothing.
              *
              * A private flow key 'fl' is built from *oldflp and completed step
              * by step (source address, output interface, destination).  Unless
              * one of the shortcuts jumps straight to make_route, fib_lookup()
              * selects the route.  A new rtable is then allocated, filled in and
              * passed to rt_intern_hash() together with rp.
              *
              * Returns the value of rt_intern_hash() once an entry has been
              * built, otherwise -EINVAL, -ENODEV, -ENETUNREACH or -ENOBUFS.
              * The done: label releases what is still held locally: the fib
              * result (when free_res is set), dev_out and in_dev.
              */
1891         u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1892         struct flowi fl = { .nl_u = { .ip4_u =
1893                                       { .daddr = oldflp->fl4_dst,
1894                                         .saddr = oldflp->fl4_src,
1895                                         .tos = tos & IPTOS_RT_MASK,
1896                                         .scope = ((tos & RTO_ONLINK) ?
1897                                                   RT_SCOPE_LINK :
1898                                                   RT_SCOPE_UNIVERSE),
1899 #ifdef CONFIG_IP_ROUTE_FWMARK
1900                                         .fwmark = oldflp->fl4_fwmark
1901 #endif
1902                                       } },
1903                             .iif = loopback_dev.ifindex,
1904                             .oif = oldflp->oif };
1905         struct fib_result res;
1906         unsigned flags = 0;
1907         struct rtable *rth;
1908         struct net_device *dev_out = NULL;
1909         struct in_device *in_dev = NULL;
1910         unsigned hash;
1911         int free_res = 0;
1912         int err;
1913 
1914         res.fi          = NULL;
1915 #ifdef CONFIG_IP_MULTIPLE_TABLES
1916         res.r           = NULL;
1917 #endif
1918 
             /* Step 1: a source address was requested - validate it. */
1919         if (oldflp->fl4_src) {
1920                 err = -EINVAL;
1921                 if (MULTICAST(oldflp->fl4_src) ||
1922                     BADCLASS(oldflp->fl4_src) ||
1923                     ZERONET(oldflp->fl4_src))
1924                         goto out;
1925 
1926                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1927                 dev_out = ip_dev_find(oldflp->fl4_src);
1928                 if (dev_out == NULL)
1929                         goto out;
1930 
1931                 /* I removed check for oif == dev_out->oif here.
1932                    It was wrong for two reasons:
1933                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
1934                       assigned to multiple interfaces.
1935                    2. Moreover, we are allowed to send packets with saddr
1936                       of another iface. --ANK
1937                  */
1938 
1939                 if (oldflp->oif == 0
1940                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1941                         /* Special hack: user can direct multicasts
1942                            and limited broadcast via necessary interface
1943                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1944                            This hack is not just for fun, it allows
1945                            vic,vat and friends to work.
1946                            They bind socket to loopback, set ttl to zero
1947                            and expect that it will work.
1948                            From the viewpoint of routing cache they are broken,
1949                            because we are not allowed to build multicast path
1950                            with loopback source addr (look, routing cache
1951                            cannot know, that ttl is zero, so that packet
1952                            will not leave this host and route is valid).
1953                            Luckily, this hack is good workaround.
1954                          */
1955 
1956                         fl.oif = dev_out->ifindex;
1957                         goto make_route;
1958                 }
                     /* The device was only needed for the check above. */
1959                 if (dev_out)
1960                         dev_put(dev_out);
1961                 dev_out = NULL;
1962         }
             /* Step 2: an output interface was requested - look it up. */
1963         if (oldflp->oif) {
1964                 dev_out = dev_get_by_index(oldflp->oif);
1965                 err = -ENODEV;
1966                 if (dev_out == NULL)
1967                         goto out;
1968                 if (__in_dev_get(dev_out) == NULL) {
1969                         dev_put(dev_out);
1970                         goto out;       /* Wrong error code */
1971                 }
1972 
1973                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
1974                         if (!fl.fl4_src)
1975                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1976                                                               RT_SCOPE_LINK);
1977                         goto make_route;
1978                 }
1979                 if (!fl.fl4_src) {
1980                         if (MULTICAST(oldflp->fl4_dst))
1981                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1982                                                               fl.fl4_scope);
1983                         else if (!oldflp->fl4_dst)
1984                                 fl.fl4_src = inet_select_addr(dev_out, 0,
1985                                                               RT_SCOPE_HOST);
1986                 }
1987         }
1988 
             /* Step 3: no destination - route to ourselves via loopback_dev. */
1989         if (!fl.fl4_dst) {
1990                 fl.fl4_dst = fl.fl4_src;
1991                 if (!fl.fl4_dst)
1992                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
1993                 if (dev_out)
1994                         dev_put(dev_out);
1995                 dev_out = &loopback_dev;
1996                 dev_hold(dev_out);
1997                 fl.oif = loopback_dev.ifindex;
1998                 res.type = RTN_LOCAL;
1999                 flags |= RTCF_LOCAL;
2000                 goto make_route;
2001         }
2002 
             /* Step 4: consult the FIB; a non-zero return means no route. */
2003         if (fib_lookup(&fl, &res)) {
2004                 res.fi = NULL;
2005                 if (oldflp->oif) {
2006                         /* Apparently, routing tables are wrong. Assume,
2007                            that the destination is on link.
2008 
2009                            WHY? DW.
2010                            Because we are allowed to send to iface
2011                            even if it has NO routes and NO assigned
2012                            addresses. When oif is specified, routing
2013                            tables are looked up with only one purpose:
2014                            to catch if destination is gatewayed, rather than
2015                            direct. Moreover, if MSG_DONTROUTE is set,
2016                            we send packet, ignoring both routing tables
2017                            and ifaddr state. --ANK
2018 
2019 
2020                            We could make it even if oif is unknown,
2021                            likely IPv6, but we do not.
2022                          */
2023 
2024                         if (fl.fl4_src == 0)
2025                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2026                                                               RT_SCOPE_LINK);
2027                         res.type = RTN_UNICAST;
2028                         goto make_route;
2029                 }
2030                 if (dev_out)
2031                         dev_put(dev_out);
2032                 err = -ENETUNREACH;
2033                 goto out;
2034         }
             /* From here on 'res' must be released with fib_res_put() at done:. */
2035         free_res = 1;
2036 
2037         if (res.type == RTN_LOCAL) {
2038                 if (!fl.fl4_src)
2039                         fl.fl4_src = fl.fl4_dst;
2040                 if (dev_out)
2041                         dev_put(dev_out);
2042                 dev_out = &loopback_dev;
2043                 dev_hold(dev_out);
2044                 fl.oif = dev_out->ifindex;
2045                 if (res.fi)
2046                         fib_info_put(res.fi);
2047                 res.fi = NULL;
2048                 flags |= RTCF_LOCAL;
2049                 goto make_route;
2050         }
2051 
2052 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2053         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2054                 fib_select_multipath(&fl, &res);
2055         else
2056 #endif
2057         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2058                 fib_select_default(&fl, &res);
2059 
2060         if (!fl.fl4_src)
2061                 fl.fl4_src = FIB_RES_PREFSRC(res);
2062 
             /* Switch dev_out to the device chosen by the FIB result. */
2063         if (dev_out)
2064                 dev_put(dev_out);
2065         dev_out = FIB_RES_DEV(res);
2066         dev_hold(dev_out);
2067         fl.oif = dev_out->ifindex;
2068 
             /* Step 5: build the cache entry from fl, res, flags and dev_out. */
2069 make_route:
2070         if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2071                 goto e_inval;
2072 
2073         if (fl.fl4_dst == 0xFFFFFFFF)
2074                 res.type = RTN_BROADCAST;
2075         else if (MULTICAST(fl.fl4_dst))
2076                 res.type = RTN_MULTICAST;
2077         else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2078                 goto e_inval;
2079 
2080         if (dev_out->flags & IFF_LOOPBACK)
2081                 flags |= RTCF_LOCAL;
2082 
2083         in_dev = in_dev_get(dev_out);
2084         if (!in_dev)
2085                 goto e_inval;
2086 
2087         if (res.type == RTN_BROADCAST) {
2088                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2089                 if (res.fi) {
2090                         fib_info_put(res.fi);
2091                         res.fi = NULL;
2092                 }
2093         } else if (res.type == RTN_MULTICAST) {
2094                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2095                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2096                         flags &= ~RTCF_LOCAL;
2097                 /* If multicast route do not exist use
2098                    default one, but do not gateway in this case.
2099                    Yes, it is hack.
2100                  */
2101                 if (res.fi && res.prefixlen < 4) {
2102                         fib_info_put(res.fi);
2103                         res.fi = NULL;
2104                 }
2105         }
2106 
2107         rth = dst_alloc(&ipv4_dst_ops);
2108         if (!rth)
2109                 goto e_nobufs;
2110 
2111         atomic_set(&rth->u.dst.__refcnt, 1);
2112         rth->u.dst.flags= DST_HOST;
2113         if (in_dev->cnf.no_xfrm)
2114                 rth->u.dst.flags |= DST_NOXFRM;
2115         if (in_dev->cnf.no_policy)
2116                 rth->u.dst.flags |= DST_NOPOLICY;
             /* The cache key (rth->fl) records what the caller asked for
                (oldflp); rt_dst/rt_src below hold the resolved values from fl. */
2117         rth->fl.fl4_dst = oldflp->fl4_dst;
2118         rth->fl.fl4_tos = tos;
2119         rth->fl.fl4_src = oldflp->fl4_src;
2120         rth->fl.oif     = oldflp->oif;
2121 #ifdef CONFIG_IP_ROUTE_FWMARK
2122         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2123 #endif
2124         rth->rt_dst     = fl.fl4_dst;
2125         rth->rt_src     = fl.fl4_src;
2126         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2127         rth->u.dst.dev  = dev_out;
             /* Extra reference for rth->u.dst.dev; the local one is dropped at done:. */
2128         dev_hold(dev_out);
2129         rth->idev       = in_dev_get(dev_out);
2130         rth->rt_gateway = fl.fl4_dst;
2131         rth->rt_spec_dst= fl.fl4_src;
2132 
2133         rth->u.dst.output=ip_output;
2134 
2135         RT_CACHE_STAT_INC(out_slow_tot);
2136 
2137         if (flags & RTCF_LOCAL) {
2138                 rth->u.dst.input = ip_local_deliver;
2139                 rth->rt_spec_dst = fl.fl4_dst;
2140         }
2141         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2142                 rth->rt_spec_dst = fl.fl4_src;
2143                 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2144                         rth->u.dst.output = ip_mc_output;
2145                         RT_CACHE_STAT_INC(out_slow_mc);
2146                 }
2147 #ifdef CONFIG_IP_MROUTE
2148                 if (res.type == RTN_MULTICAST) {
2149                         if (IN_DEV_MFORWARD(in_dev) &&
2150                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2151                                 rth->u.dst.input = ip_mr_input;
2152                                 rth->u.dst.output = ip_mc_output;
2153                         }
2154                 }
2155 #endif
2156         }
2157 
2158         rt_set_nexthop(rth, &res, 0);
2159         
2160 
2161         rth->rt_flags = flags;
2162 
             /* Hash on the caller's key, with tos already masked (see top). */
2163         hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2164         err = rt_intern_hash(hash, rth, rp);
             /* Common exit: release everything still held by this function. */
2165 done:
2166         if (free_res)
2167                 fib_res_put(&res);
2168         if (dev_out)
2169                 dev_put(dev_out);
2170         if (in_dev)
2171                 in_dev_put(in_dev);
2172 out:    return err;
2173 
2174 e_inval:
2175         err = -EINVAL;
2176         goto done;
2177 e_nobufs:
2178         err = -ENOBUFS;
2179         goto done;
2180 }
2181 
2182 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2183 {
             /*
              * Resolve an output route for the flow key *flp.
              *
              * The route cache chain selected by rt_hash_code() is searched
              * under rcu_read_lock_bh().  An entry matches when dst, src, oif
              * and (if configured) fwmark are equal, its iif is 0, and the tos
              * values agree in the IPTOS_RT_MASK | RTO_ONLINK bits.  On a hit
              * the entry is referenced with dst_hold(), stored in *rp and 0 is
              * returned.  On a miss the result of ip_route_output_slow() is
              * returned.
              */
2184         unsigned hash;
2185         struct rtable *rth;
2186 
             /* Hash on the same masked tos that ip_route_output_slow() uses when
                it inserts the entry; hashing the raw fl4_tos would probe a
                different bucket whenever bits outside the mask are set, so such
                keys could never hit the cache. */
2187         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5),
                                 flp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK));
2188 
2189         rcu_read_lock_bh();
2190         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2191                 rth = rcu_dereference(rth->u.rt_next)) {
2192                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2193                     rth->fl.fl4_src == flp->fl4_src &&
2194                     rth->fl.iif == 0 &&
2195                     rth->fl.oif == flp->oif &&
2196 #ifdef CONFIG_IP_ROUTE_FWMARK
2197                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2198 #endif
2199                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2200                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2201                         rth->u.dst.lastuse = jiffies;
                             /* Reference is taken before the RCU section ends. */
2202                         dst_hold(&rth->u.dst);
2203                         rth->u.dst.__use++;
2204                         RT_CACHE_STAT_INC(out_hit);
2205                         rcu_read_unlock_bh();
2206                         *rp = rth;
2207                         return 0;
2208                 }
2209                 RT_CACHE_STAT_INC(out_hlist_search);
2210         }
2211         rcu_read_unlock_bh();
2212 
2213         return ip_route_output_slow(rp, flp);
2214 }
2215 
2216 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2217 {
             /*
              * Resolve an output route for *flp and, when flp->proto is set,
              * pass the result through xfrm_lookup().
              *
              * Returns the error from __ip_route_output_key() if that fails,
              * the result of xfrm_lookup() when flp->proto is non-zero, and 0
              * otherwise.  Note that *flp may be modified: empty source and
              * destination addresses are filled in from the route found.
              */
2218         int err;
2219 
2220         if ((err = __ip_route_output_key(rp, flp)) != 0)
2221                 return err;
2222 
2223         if (flp->proto) {
                     /* Complete the key with the addresses the route selected. */
2224                 if (!flp->fl4_src)
2225                         flp->fl4_src = (*rp)->rt_src;
2226                 if (!flp->fl4_dst)
2227                         flp->fl4_dst = (*rp)->rt_dst;
2228                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2229         }
2230 
2231         return 0;
2232 }
2233 
2234 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2235 {
             /* Same as ip_route_output_flow() with no socket and flags == 0. */
2236         return ip_route_output_flow(rp, flp, NULL, 0);
2237 }
2238 
2239 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2240                         int nowait)
2241 {
             /*
              * Append to skb one netlink message describing the route cache
              * entry currently attached as skb->dst.
              *
              * pid, seq and event are handed to NLMSG_PUT for the message
              * header; nowait selects NLM_F_MULTI (when pid is also non-zero)
              * and changes the handling of ipmr_get_route() results below.
              *
              * Returns skb->len on success.  On failure the skb is trimmed back
              * to where it was on entry and -1 is returned.  With
              * CONFIG_IP_MROUTE, 0 is returned when ipmr_get_route() returns 0
              * and nowait is not set.
              *
              * NOTE(review): NLMSG_PUT and RTA_PUT are macros defined elsewhere;
              * presumably they jump to nlmsg_failure / rtattr_failure when the
              * skb has no room left - confirm against their definitions.
              */
2242         struct rtable *rt = (struct rtable*)skb->dst;
2243         struct rtmsg *r;
2244         struct nlmsghdr  *nlh;
             /* Remember the current tail so a failure can undo everything added. */
2245         unsigned char    *b = skb->tail;
2246         struct rta_cacheinfo ci;
2247 #ifdef CONFIG_IP_MROUTE
2248         struct rtattr *eptr;
2249 #endif
2250         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2251         r = NLMSG_DATA(nlh);
2252         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2253         r->rtm_family    = AF_INET;
2254         r->rtm_dst_len  = 32;
2255         r->rtm_src_len  = 0;
2256         r->rtm_tos      = rt->fl.fl4_tos;
2257         r->rtm_table    = RT_TABLE_MAIN;
2258         r->rtm_type     = rt->rt_type;
2259         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2260         r->rtm_protocol = RTPROT_UNSPEC;
2261         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2262         if (rt->rt_flags & RTCF_NOTIFY)
2263                 r->rtm_flags |= RTM_F_NOTIFY;
2264         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2265         if (rt->fl.fl4_src) {
2266                 r->rtm_src_len = 32;
2267                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2268         }
2269         if (rt->u.dst.dev)
2270                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2271 #ifdef CONFIG_NET_CLS_ROUTE
2272         if (rt->u.dst.tclassid)
2273                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2274 #endif
             /* Entries with an input interface report rt_spec_dst as preferred
                source; others report rt_src when it differs from the key. */
2275         if (rt->fl.iif)
2276                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2277         else if (rt->rt_src != rt->fl.fl4_src)
2278                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2279         if (rt->rt_dst != rt->rt_gateway)
2280                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2281         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2282                 goto rtattr_failure;
             /* Collect the cache statistics reported as RTA_CACHEINFO. */
2283         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2284         ci.rta_used     = rt->u.dst.__use;
2285         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2286         if (rt->u.dst.expires)
2287                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2288         else
2289                 ci.rta_expires = 0;
2290         ci.rta_error    = rt->u.dst.error;
2291         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2292         if (rt->peer) {
2293                 ci.rta_id = rt->peer->ip_id_count;
2294                 if (rt->peer->tcp_ts_stamp) {
2295                         ci.rta_ts = rt->peer->tcp_ts;
2296                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2297                 }
2298         }
2299 #ifdef CONFIG_IP_MROUTE
             /* Remember where RTA_CACHEINFO lands so rta_error can be patched. */
2300         eptr = (struct rtattr*)skb->tail;
2301 #endif
2302         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2303         if (rt->fl.iif) {
2304 #ifdef CONFIG_IP_MROUTE
2305                 u32 dst = rt->rt_dst;
2306 
2307                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2308                     ipv4_devconf.mc_forwarding) {
2309                         int err = ipmr_get_route(skb, r, nowait);
2310                         if (err <= 0) {
2311                                 if (!nowait) {
2312                                         if (err == 0)
2313                                                 return 0;
2314                                         goto nlmsg_failure;
2315                                 } else {
2316                                         if (err == -EMSGSIZE)
2317                                                 goto nlmsg_failure;
                                             /* Report the error inside the cacheinfo
                                                attribute that was already emitted. */
2318                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2319                                 }
2320                         }
2321                 } else
2322 #endif
2323                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2324         }
2325 
             /* Message length is everything appended since entry. */
2326         nlh->nlmsg_len = skb->tail - b;
2327         return skb->len;
2328 
2329 nlmsg_failure:
2330 rtattr_failure:
2331         skb_trim(skb, b - skb->data);
2332         return -1;
2333 }
2334 
2335 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2336 {
             /*
              * Answer a netlink "get route" request.
              *
              * arg is used as the array of request attributes (indexed by
              * RTA_xxx - 1); RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF are read
              * from it when present.  With an input interface the lookup is
              * done with ip_route_input() on a dummy skb, otherwise with
              * ip_route_output_key().  The route found is described with
              * rt_fill_info() and sent back with netlink_unicast().
              *
              * Returns 0 on success or a negative errno (-ENOBUFS, -ENODEV,
              * -EMSGSIZE, or the error of the route lookup / netlink_unicast()).
              */
2337         struct rtattr **rta = arg;
2338         struct rtmsg *rtm = NLMSG_DATA(nlh);
2339         struct rtable *rt = NULL;
2340         u32 dst = 0;
2341         u32 src = 0;
2342         int iif = 0;
2343         int err = -ENOBUFS;
2344         struct sk_buff *skb;
2345 
2346         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2347         if (!skb)
2348                 goto out;
2349 
2350         /* Reserve room for dummy headers, this skb can pass
2351            through good chunk of routing engine.
2352          */
2353         skb->mac.raw = skb->data;
             /* ip_route_input() reads skb->nh.iph->protocol for multicast
                destinations, so the dummy skb needs a network header pointer
                that refers to real memory.  Point it at the start of the
                buffer (inside the headroom reserved just below) and give it a
                harmless protocol value. */
             skb->nh.iph = (struct iphdr *)skb->data;
             skb->nh.iph->protocol = IPPROTO_ICMP;
2354         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2355 
2356         if (rta[RTA_SRC - 1])
2357                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2358         if (rta[RTA_DST - 1])
2359                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2360         if (rta[RTA_IIF - 1])
2361                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2362 
2363         if (iif) {
                     /* Input lookup: pretend the packet arrived on device iif. */
2364                 struct net_device *dev = __dev_get_by_index(iif);
2365                 err = -ENODEV;
2366                 if (!dev)
2367                         goto out_free;
2368                 skb->protocol   = htons(ETH_P_IP);
2369                 skb->dev        = dev;
2370                 local_bh_disable();
2371                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2372                 local_bh_enable();
2373                 rt = (struct rtable*)skb->dst;
                     /* A cached error route counts as a failed lookup. */
2374                 if (!err && rt->u.dst.error)
2375                         err = -rt->u.dst.error;
2376         } else {
                     /* Output lookup keyed on dst, src, tos and optional oif. */
2377                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2378                                                          .saddr = src,
2379                                                          .tos = rtm->rtm_tos } } };
2380                 int oif = 0;
2381                 if (rta[RTA_OIF - 1])
2382                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2383                 fl.oif = oif;
2384                 err = ip_route_output_key(&rt, &fl);
2385         }
2386         if (err)
2387                 goto out_free;
2388 
2389         skb->dst = &rt->u.dst;
2390         if (rtm->rtm_flags & RTM_F_NOTIFY)
2391                 rt->rt_flags |= RTCF_NOTIFY;
2392 
2393         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2394 
2395         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2396                                 RTM_NEWROUTE, 0);
             /* 0: nothing to send from here; negative: the message did not fit. */
2397         if (!err)
2398                 goto out_free;
2399         if (err < 0) {
2400                 err = -EMSGSIZE;
2401                 goto out_free;
2402         }
2403 
2404         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2405         if (err > 0)
2406                 err = 0;
2407 out:    return err;
2408 
2409 out_free:
2410         kfree_skb(skb);
2411         goto out;
2412 }
2413 
2414 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2415 {
2416         struct rtable *rt;
2417         int h, s_h;
2418         int idx, s_idx;
2419 
2420         s_h = cb->args[0];
2421         s_idx = idx = cb->args[1];
2422         for (h = 0; h <= rt_hash_mask; h++) {
2423                 if (h < s_h) continue;
2424                 if (h > s_h)
2425                         s_idx = 0;
2426                 rcu_read_lock_bh();
2427                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2428                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2429                         if (idx < s_idx)
2430                                 continue;
2431                         skb->dst = dst_clone(&rt->u.dst);
2432                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2433                                          cb->nlh->nlmsg_seq,
2434                                          RTM_NEWROUTE, 1) <= 0) {
2435                                 dst_release(xchg(&skb->dst, NULL));
2436                                 rcu_read_unlock_bh();
2437                                 goto done;
2438                         }
2439                         dst_release(xchg(&skb->dst, NULL));
2440                 }
2441                 rcu_read_unlock_bh();
2442         }
2443 
2444 done:
2445         cb->args[0] = h;
2446         cb->args[1] = idx;
2447         return skb->len;
2448 }
2449 
/*
 * Multicast event hook: flushes the routing cache by calling
 * rt_cache_flush() with a delay argument of 0 (presumably an immediate
 * flush - confirm against rt_cache_flush()).
 *
 * in_dev is accepted for the caller's benefit but is not used here.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}
2454 
2455 #ifdef CONFIG_SYSCTL
2456 static int flush_delay;
2457 
2458 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2459                                         struct file *filp, void __user *buffer,
2460                                         size_t *lenp, loff_t *ppos)
2461 {
2462         if (write) {
2463                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2464                 rt_cache_flush(flush_delay);
2465                 return 0;
2466         } 
2467 
2468         return -EINVAL;
2469 }
2470 
2471 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2472                                                 int __user *name,
2473                                                 int nlen,
2474                                                 void __user *oldval,
2475                                                 size_t __user *oldlenp,
2476                                                 void __user *newval,
2477                                                 size_t newlen,
2478                                                 void **context)
2479 {
2480         int delay;
2481         if (newlen != sizeof(int))
2482                 return -EINVAL;
2483         if (get_user(delay, (int __user *)newval))
2484                 return -EFAULT; 
2485         rt_cache_flush(delay); 
2486         return 0;
2487 }
2488 
2489 ctl_table ipv4_route_table[] = {
2490         {
2491                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2492                 .procname       = "flush",
2493                 .data           = &flush_delay,
2494                 .maxlen         = sizeof(int),
2495                 .mode           = 0644,
2496                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2497                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2498         },
2499         {
2500                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2501                 .procname       = "min_delay",
2502                 .data           = &ip_rt_min_delay,
2503                 .maxlen         = sizeof(int),
2504                 .mode           = 0644,
2505                 .proc_handler   = &proc_dointvec_jiffies,
2506                 .strategy       = &sysctl_jiffies,
2507         },
2508         {
2509                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2510                 .procname       = "max_delay",
2511                 .data           = &ip_rt_max_delay,
2512                 .maxlen         = sizeof(int),
2513                 .mode           = 0644,
2514                 .proc_handler   = &proc_dointvec_jiffies,
2515                 .strategy       = &sysctl_jiffies,
2516         },
2517         {
2518                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2519                 .procname       = "gc_thresh",
2520                 .data           = &ipv4_dst_ops.gc_thresh,
2521                 .maxlen         = sizeof(int),
2522                 .mode           = 0644,
2523                 .proc_handler   = &proc_dointvec,
2524         },
2525         {
2526                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2527                 .procname       = "max_size",
2528                 .data           = &ip_rt_max_size,
2529                 .maxlen         = sizeof(int),
2530                 .mode           = 0644,
2531                 .proc_handler   = &proc_dointvec,
2532         },
2533         {
2534                 /*  Deprecated. Use gc_min_interval_ms */
2535  
2536                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2537                 .procname       = "gc_min_interval",
2538                 .data           = &ip_rt_gc_min_interval,
2539                 .maxlen         = sizeof(int),
2540                 .mode           = 0644,
2541                 .proc_handler   = &proc_dointvec_jiffies,
2542                 .strategy       = &sysctl_jiffies,
2543         },
2544         {
2545                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2546                 .procname       = "gc_min_interval_ms",
2547                 .data           = &ip_rt_gc_min_interval,
2548                 .maxlen         = sizeof(int),
2549                 .mode           = 0644,
2550                 .proc_handler   = &proc_dointvec_ms_jiffies,
2551                 .strategy       = &sysctl_ms_jiffies,
2552         },
2553         {
2554                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2555                 .procname       = "gc_timeout",
2556                 .data           = &ip_rt_gc_timeout,
2557                 .maxlen         = sizeof(int),
2558                 .mode           = 0644,
2559                 .proc_handler   = &proc_dointvec_jiffies,
2560                 .strategy       = &sysctl_jiffies,
2561         },
2562         {
2563                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2564                 .procname       = "gc_interval",
2565                 .data           = &ip_rt_gc_interval,
2566                 .maxlen         = sizeof(int),
2567                 .mode           = 0644,
2568                 .proc_handler   = &proc_dointvec_jiffies,
2569                 .strategy       = &sysctl_jiffies,
2570         },
2571         {
2572                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2573                 .procname       = "redirect_load",
2574                 .data           = &ip_rt_redirect_load,
2575                 .maxlen         = sizeof(int),
2576                 .mode           = 0644,
2577                 .proc_handler   = &proc_dointvec,
2578         },
2579         {
2580                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2581                 .procname       = "redirect_number",
2582                 .data           = &ip_rt_redirect_number,
2583                 .maxlen         = sizeof(int),
2584                 .mode           = 0644,
2585                 .proc_handler   = &proc_dointvec,
2586         },
2587         {
2588                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2589                 .procname       = "redirect_silence",
2590                 .data           = &ip_rt_redirect_silence,
2591                 .maxlen         = sizeof(int),
2592                 .mode           = 0644,
2593                 .proc_handler   = &proc_dointvec,
2594         },
2595         {
2596                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2597                 .procname       = "error_cost",
2598                 .data           = &ip_rt_error_cost,
2599                 .maxlen         = sizeof(int),
2600                 .mode           = 0644,
2601                 .proc_handler   = &proc_dointvec,
2602         },
2603         {
2604                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2605                 .procname       = "error_burst",
2606                 .data           = &ip_rt_error_burst,
2607                 .maxlen         = sizeof(int),
2608                 .mode           = 0644,
2609                 .proc_handler   = &proc_dointvec,
2610         },
2611         {
2612                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2613                 .procname       = "gc_elasticity",
2614                 .data           = &ip_rt_gc_elasticity,
2615                 .maxlen         = sizeof(int),
2616                 .mode           = 0644,
2617                 .proc_handler   = &proc_dointvec,
2618         },
2619         {
2620                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2621                 .procname       = "mtu_expires",
2622                 .data           = &ip_rt_mtu_expires,
2623                 .maxlen         = sizeof(int),
2624                 .mode           = 0644,
2625                 .proc_handler   = &proc_dointvec_jiffies,
2626                 .strategy       = &sysctl_jiffies,
2627         },
2628         {
2629                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2630                 .procname       = "min_pmtu",
2631                 .data           = &ip_rt_min_pmtu,
2632                 .maxlen         = sizeof(int),
2633                 .mode           = 0644,
2634                 .proc_handler   = &proc_dointvec,
2635         },
2636         {
2637                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2638                 .procname       = "min_adv_mss",
2639                 .data           = &ip_rt_min_advmss,
2640                 .maxlen         = sizeof(int),
2641                 .mode           = 0644,
2642                 .proc_handler   = &proc_dointvec,
2643         },
2644         {
2645                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2646                 .procname       = "secret_interval",
2647                 .data           = &ip_rt_secret_interval,
2648                 .maxlen         = sizeof(int),
2649                 .mode           = 0644,
2650                 .proc_handler   = &proc_dointvec_jiffies,
2651                 .strategy       = &sysctl_jiffies,
2652         },
2653         { .ctl_name = 0 }
2654 };
2655 #endif
2656 
2657 #ifdef CONFIG_NET_CLS_ROUTE
/*
 * Base of the route accounting area.  It is laid out as one run of 256
 * struct ip_rt_acct slots per cpu, indexed by cpu number.
 */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 * The argument is parenthesised so that an expression such as
 * IP_RT_ACCT_CPU(cpu + 1) selects the intended cpu's slots.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2664 
2665 #ifdef CONFIG_PROC_FS
2666 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2667                            int length, int *eof, void *data)
2668 {
2669         unsigned int i;
2670 
2671         if ((offset & 3) || (length & 3))
2672                 return -EIO;
2673 
2674         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2675                 *eof = 1;
2676                 return 0;
2677         }
2678 
2679         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2680                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2681                 *eof = 1;
2682         }
2683 
2684         offset /= sizeof(u32);
2685 
2686         if (length > 0) {
2687                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2688                 u32 *dst = (u32 *) buffer;
2689 
2690                 /* Copy first cpu. */
2691                 *start = buffer;
2692                 memcpy(dst, src, length);
2693 
2694                 /* Add the other cpus in, one int at a time */
2695                 for_each_cpu(i) {
2696                         unsigned int j;
2697 
2698                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2699 
2700                         for (j = 0; j < length/4; j++)
2701                                 dst[j] += src[j];
2702                 }
2703         }
2704         return length;
2705 }
2706 #endif /* CONFIG_PROC_FS */
2707 #endif /* CONFIG_NET_CLS_ROUTE */
2708 
2709 static __initdata unsigned long rhash_entries;
2710 static int __init set_rhash_entries(char *str)
2711 {
2712         if (!str)
2713                 return 0;
2714         rhash_entries = simple_strtoul(str, &str, 0);
2715         return 1;
2716 }
2717 __setup("rhash_entries=", set_rhash_entries);
2718 
2719 int __init ip_rt_init(void)
2720 {
2721         int i, order, goal, rc = 0;
2722 
2723         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2724                              (jiffies ^ (jiffies >> 7)));
2725 
2726 #ifdef CONFIG_NET_CLS_ROUTE
2727         for (order = 0;
2728              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2729                 /* NOTHING */;
2730         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2731         if (!ip_rt_acct)
2732                 panic("IP: failed to allocate ip_rt_acct\n");
2733         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2734 #endif
2735 
2736         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2737                                                      sizeof(struct rtable),
2738                                                      0, SLAB_HWCACHE_ALIGN,
2739                                                      NULL, NULL);
2740 
2741         if (!ipv4_dst_ops.kmem_cachep)
2742                 panic("IP: failed to allocate ip_dst_cache\n");
2743 
2744         goal = num_physpages >> (26 - PAGE_SHIFT);
2745         if (rhash_entries)
2746                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2747         for (order = 0; (1UL << order) < goal; order++)
2748                 /* NOTHING */;
2749 
2750         do {
2751                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2752                         sizeof(struct rt_hash_bucket);
2753                 while (rt_hash_mask & (rt_hash_mask - 1))
2754                         rt_hash_mask--;
2755                 rt_hash_table = (struct rt_hash_bucket *)
2756                         __get_free_pages(GFP_ATOMIC, order);
2757         } while (rt_hash_table == NULL && --order > 0);
2758 
2759         if (!rt_hash_table)
2760                 panic("Failed to allocate IP route cache hash table\n");
2761 
2762         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2763                rt_hash_mask,
2764                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2765 
2766         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2767                 /* NOTHING */;
2768 
2769         rt_hash_mask--;
2770         for (i = 0; i <= rt_hash_mask; i++) {
2771                 spin_lock_init(&rt_hash_table[i].lock);
2772                 rt_hash_table[i].chain = NULL;
2773         }
2774 
2775         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2776         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2777 
2778         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2779         if (!rt_cache_stat)
2780                 return -ENOMEM;
2781 
2782         devinet_init();
2783         ip_fib_init();
2784 
2785         init_timer(&rt_flush_timer);
2786         rt_flush_timer.function = rt_run_flush;
2787         init_timer(&rt_periodic_timer);
2788         rt_periodic_timer.function = rt_check_expire;
2789         init_timer(&rt_secret_timer);
2790         rt_secret_timer.function = rt_secret_rebuild;
2791 
2792         /* All the timers, started at system startup tend
2793            to synchronize. Perturb it a bit.
2794          */
2795         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2796                                         ip_rt_gc_interval;
2797         add_timer(&rt_periodic_timer);
2798 
2799         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2800                 ip_rt_secret_interval;
2801         add_timer(&rt_secret_timer);
2802 
2803 #ifdef CONFIG_PROC_FS
2804         {
2805         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2806         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2807             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
2808                                              proc_net_stat))) {
2809                 free_percpu(rt_cache_stat);
2810                 return -ENOMEM;
2811         }
2812         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2813         }
2814 #ifdef CONFIG_NET_CLS_ROUTE
2815         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2816 #endif
2817 #endif
2818 #ifdef CONFIG_XFRM
2819         xfrm_init();
2820         xfrm4_init();
2821 #endif
2822         return rc;
2823 }
2824 
2825 EXPORT_SYMBOL(__ip_select_ident);
2826 EXPORT_SYMBOL(ip_route_input);
2827 EXPORT_SYMBOL(ip_route_output_key);
2828 
  This page was automatically generated by the LXR engine.