net dst: use a percpu_counter to track entries
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
144                                          struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150
151 static struct dst_ops ipv4_dst_ops = {
152         .family =               AF_INET,
153         .protocol =             cpu_to_be16(ETH_P_IP),
154         .gc =                   rt_garbage_collect,
155         .check =                ipv4_dst_check,
156         .destroy =              ipv4_dst_destroy,
157         .ifdown =               ipv4_dst_ifdown,
158         .negative_advice =      ipv4_negative_advice,
159         .link_failure =         ipv4_link_failure,
160         .update_pmtu =          ip_rt_update_pmtu,
161         .local_out =            __ip_local_out,
162 };
163
164 #define ECN_OR_COST(class)      TC_PRIO_##class
165
166 const __u8 ip_tos2prio[16] = {
167         TC_PRIO_BESTEFFORT,
168         ECN_OR_COST(FILLER),
169         TC_PRIO_BESTEFFORT,
170         ECN_OR_COST(BESTEFFORT),
171         TC_PRIO_BULK,
172         ECN_OR_COST(BULK),
173         TC_PRIO_BULK,
174         ECN_OR_COST(BULK),
175         TC_PRIO_INTERACTIVE,
176         ECN_OR_COST(INTERACTIVE),
177         TC_PRIO_INTERACTIVE,
178         ECN_OR_COST(INTERACTIVE),
179         TC_PRIO_INTERACTIVE_BULK,
180         ECN_OR_COST(INTERACTIVE_BULK),
181         TC_PRIO_INTERACTIVE_BULK,
182         ECN_OR_COST(INTERACTIVE_BULK)
183 };
184
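/*
 * Illustrative note (not part of the original source): this table maps the
 * IPv4 TOS field to a queueing priority.  A lookup elsewhere (for instance
 * rt_tos2priority() in include/net/route.h) is expected to take the form
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * IPTOS_TOS() masks the TOS byte with 0x1E, so the shifted value is in the
 * range 0..15 and indexes exactly the 16 entries above.
 */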
185
186 /*
187  * Route cache.
188  */
189
190 /* The locking scheme is rather straightforward:
191  *
192  * 1) Read-Copy Update protects the buckets of the central route hash.
193  * 2) Only writers remove entries, and they hold the lock
194  *    as they look at rtable reference counts.
195  * 3) Only readers acquire references to rtable entries,
196  *    they do so with atomic increments and with the
197  *    lock held.
198  */
199
200 struct rt_hash_bucket {
201         struct rtable   *chain;
202 };
203
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205         defined(CONFIG_PROVE_LOCKING)
206 /*
207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
208  * The size of this table is a power of two and depends on the number of CPUs.
209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
210  */
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ        256
213 #else
214 # if NR_CPUS >= 32
215 #  define RT_HASH_LOCK_SZ       4096
216 # elif NR_CPUS >= 16
217 #  define RT_HASH_LOCK_SZ       2048
218 # elif NR_CPUS >= 8
219 #  define RT_HASH_LOCK_SZ       1024
220 # elif NR_CPUS >= 4
221 #  define RT_HASH_LOCK_SZ       512
222 # else
223 #  define RT_HASH_LOCK_SZ       256
224 # endif
225 #endif
226
227 static spinlock_t       *rt_hash_locks;
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
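
/*
 * Illustrative example (not part of the original source): the lock table is
 * smaller than the hash table, so many buckets share one spinlock.  With
 * RT_HASH_LOCK_SZ = 2048, buckets 5, 2053, 4101, ... all map to
 * rt_hash_locks[5]; writers take
 *
 *	spin_lock_bh(rt_hash_lock_addr(bucket));
 *
 * around any modification of rt_hash_table[bucket].chain.
 */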
229
230 static __init void rt_hash_lock_init(void)
231 {
232         int i;
233
234         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235                         GFP_KERNEL);
236         if (!rt_hash_locks)
237                 panic("IP: failed to allocate rt_hash_locks\n");
238
239         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240                 spin_lock_init(&rt_hash_locks[i]);
241 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244
245 static inline void rt_hash_lock_init(void)
246 {
247 }
248 #endif
249
250 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
251 static unsigned                 rt_hash_mask __read_mostly;
252 static unsigned int             rt_hash_log  __read_mostly;
253
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
256
257 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
258                                    int genid)
259 {
260         return jhash_3words((__force u32)daddr, (__force u32)saddr,
261                             idx, genid)
262                 & rt_hash_mask;
263 }
264
265 static inline int rt_genid(struct net *net)
266 {
267         return atomic_read(&net->ipv4.rt_genid);
268 }
269
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272         struct seq_net_private p;
273         int bucket;
274         int genid;
275 };
276
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279         struct rt_cache_iter_state *st = seq->private;
280         struct rtable *r = NULL;
281
282         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283                 if (!rt_hash_table[st->bucket].chain)
284                         continue;
285                 rcu_read_lock_bh();
286                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
287                 while (r) {
288                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
289                             r->rt_genid == st->genid)
290                                 return r;
291                         r = rcu_dereference_bh(r->dst.rt_next);
292                 }
293                 rcu_read_unlock_bh();
294         }
295         return r;
296 }
297
298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299                                           struct rtable *r)
300 {
301         struct rt_cache_iter_state *st = seq->private;
302
303         r = r->dst.rt_next;
304         while (!r) {
305                 rcu_read_unlock_bh();
306                 do {
307                         if (--st->bucket < 0)
308                                 return NULL;
309                 } while (!rt_hash_table[st->bucket].chain);
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference_bh(r);
314 }
315
316 static struct rtable *rt_cache_get_next(struct seq_file *seq,
317                                         struct rtable *r)
318 {
319         struct rt_cache_iter_state *st = seq->private;
320         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
321                 if (dev_net(r->dst.dev) != seq_file_net(seq))
322                         continue;
323                 if (r->rt_genid == st->genid)
324                         break;
325         }
326         return r;
327 }
328
329 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
330 {
331         struct rtable *r = rt_cache_get_first(seq);
332
333         if (r)
334                 while (pos && (r = rt_cache_get_next(seq, r)))
335                         --pos;
336         return pos ? NULL : r;
337 }
338
339 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
340 {
341         struct rt_cache_iter_state *st = seq->private;
342         if (*pos)
343                 return rt_cache_get_idx(seq, *pos - 1);
344         st->genid = rt_genid(seq_file_net(seq));
345         return SEQ_START_TOKEN;
346 }
347
348 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
349 {
350         struct rtable *r;
351
352         if (v == SEQ_START_TOKEN)
353                 r = rt_cache_get_first(seq);
354         else
355                 r = rt_cache_get_next(seq, v);
356         ++*pos;
357         return r;
358 }
359
360 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
361 {
362         if (v && v != SEQ_START_TOKEN)
363                 rcu_read_unlock_bh();
364 }
365
366 static int rt_cache_seq_show(struct seq_file *seq, void *v)
367 {
368         if (v == SEQ_START_TOKEN)
369                 seq_printf(seq, "%-127s\n",
370                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
371                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
372                            "HHUptod\tSpecDst");
373         else {
374                 struct rtable *r = v;
375                 int len;
376
377                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
378                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
379                         r->dst.dev ? r->dst.dev->name : "*",
380                         (__force u32)r->rt_dst,
381                         (__force u32)r->rt_gateway,
382                         r->rt_flags, atomic_read(&r->dst.__refcnt),
383                         r->dst.__use, 0, (__force u32)r->rt_src,
384                         (dst_metric(&r->dst, RTAX_ADVMSS) ?
385                              (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
386                         dst_metric(&r->dst, RTAX_WINDOW),
387                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
388                               dst_metric(&r->dst, RTAX_RTTVAR)),
389                         r->fl.fl4_tos,
390                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
391                         r->dst.hh ? (r->dst.hh->hh_output ==
392                                        dev_queue_xmit) : 0,
393                         r->rt_spec_dst, &len);
394
395                 seq_printf(seq, "%*s\n", 127 - len, "");
396         }
397         return 0;
398 }
399
400 static const struct seq_operations rt_cache_seq_ops = {
401         .start  = rt_cache_seq_start,
402         .next   = rt_cache_seq_next,
403         .stop   = rt_cache_seq_stop,
404         .show   = rt_cache_seq_show,
405 };
406
407 static int rt_cache_seq_open(struct inode *inode, struct file *file)
408 {
409         return seq_open_net(inode, file, &rt_cache_seq_ops,
410                         sizeof(struct rt_cache_iter_state));
411 }
412
413 static const struct file_operations rt_cache_seq_fops = {
414         .owner   = THIS_MODULE,
415         .open    = rt_cache_seq_open,
416         .read    = seq_read,
417         .llseek  = seq_lseek,
418         .release = seq_release_net,
419 };
420
421
422 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
423 {
424         int cpu;
425
426         if (*pos == 0)
427                 return SEQ_START_TOKEN;
428
429         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
430                 if (!cpu_possible(cpu))
431                         continue;
432                 *pos = cpu+1;
433                 return &per_cpu(rt_cache_stat, cpu);
434         }
435         return NULL;
436 }
437
438 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
439 {
440         int cpu;
441
442         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
443                 if (!cpu_possible(cpu))
444                         continue;
445                 *pos = cpu+1;
446                 return &per_cpu(rt_cache_stat, cpu);
447         }
448         return NULL;
449
450 }
451
452 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
453 {
454
455 }
456
457 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
458 {
459         struct rt_cache_stat *st = v;
460
461         if (v == SEQ_START_TOKEN) {
462                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
463                 return 0;
464         }
465
466         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
467                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
468                    dst_entries_get_slow(&ipv4_dst_ops),
469                    st->in_hit,
470                    st->in_slow_tot,
471                    st->in_slow_mc,
472                    st->in_no_route,
473                    st->in_brd,
474                    st->in_martian_dst,
475                    st->in_martian_src,
476
477                    st->out_hit,
478                    st->out_slow_tot,
479                    st->out_slow_mc,
480
481                    st->gc_total,
482                    st->gc_ignored,
483                    st->gc_goal_miss,
484                    st->gc_dst_overflow,
485                    st->in_hlist_search,
486                    st->out_hlist_search
487                 );
488         return 0;
489 }
490
491 static const struct seq_operations rt_cpu_seq_ops = {
492         .start  = rt_cpu_seq_start,
493         .next   = rt_cpu_seq_next,
494         .stop   = rt_cpu_seq_stop,
495         .show   = rt_cpu_seq_show,
496 };
497
498
499 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
500 {
501         return seq_open(file, &rt_cpu_seq_ops);
502 }
503
504 static const struct file_operations rt_cpu_seq_fops = {
505         .owner   = THIS_MODULE,
506         .open    = rt_cpu_seq_open,
507         .read    = seq_read,
508         .llseek  = seq_lseek,
509         .release = seq_release,
510 };
511
512 #ifdef CONFIG_NET_CLS_ROUTE
513 static int rt_acct_proc_show(struct seq_file *m, void *v)
514 {
515         struct ip_rt_acct *dst, *src;
516         unsigned int i, j;
517
518         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
519         if (!dst)
520                 return -ENOMEM;
521
522         for_each_possible_cpu(i) {
523                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
524                 for (j = 0; j < 256; j++) {
525                         dst[j].o_bytes   += src[j].o_bytes;
526                         dst[j].o_packets += src[j].o_packets;
527                         dst[j].i_bytes   += src[j].i_bytes;
528                         dst[j].i_packets += src[j].i_packets;
529                 }
530         }
531
532         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
533         kfree(dst);
534         return 0;
535 }
536
537 static int rt_acct_proc_open(struct inode *inode, struct file *file)
538 {
539         return single_open(file, rt_acct_proc_show, NULL);
540 }
541
542 static const struct file_operations rt_acct_proc_fops = {
543         .owner          = THIS_MODULE,
544         .open           = rt_acct_proc_open,
545         .read           = seq_read,
546         .llseek         = seq_lseek,
547         .release        = single_release,
548 };
549 #endif
550
551 static int __net_init ip_rt_do_proc_init(struct net *net)
552 {
553         struct proc_dir_entry *pde;
554
555         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
556                         &rt_cache_seq_fops);
557         if (!pde)
558                 goto err1;
559
560         pde = proc_create("rt_cache", S_IRUGO,
561                           net->proc_net_stat, &rt_cpu_seq_fops);
562         if (!pde)
563                 goto err2;
564
565 #ifdef CONFIG_NET_CLS_ROUTE
566         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
567         if (!pde)
568                 goto err3;
569 #endif
570         return 0;
571
572 #ifdef CONFIG_NET_CLS_ROUTE
573 err3:
574         remove_proc_entry("rt_cache", net->proc_net_stat);
575 #endif
576 err2:
577         remove_proc_entry("rt_cache", net->proc_net);
578 err1:
579         return -ENOMEM;
580 }
581
582 static void __net_exit ip_rt_do_proc_exit(struct net *net)
583 {
584         remove_proc_entry("rt_cache", net->proc_net_stat);
585         remove_proc_entry("rt_cache", net->proc_net);
586 #ifdef CONFIG_NET_CLS_ROUTE
587         remove_proc_entry("rt_acct", net->proc_net);
588 #endif
589 }
590
591 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
592         .init = ip_rt_do_proc_init,
593         .exit = ip_rt_do_proc_exit,
594 };
595
596 static int __init ip_rt_proc_init(void)
597 {
598         return register_pernet_subsys(&ip_rt_proc_ops);
599 }
600
601 #else
602 static inline int ip_rt_proc_init(void)
603 {
604         return 0;
605 }
606 #endif /* CONFIG_PROC_FS */
607
608 static inline void rt_free(struct rtable *rt)
609 {
610         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
611 }
612
613 static inline void rt_drop(struct rtable *rt)
614 {
615         ip_rt_put(rt);
616         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
617 }
618
619 static inline int rt_fast_clean(struct rtable *rth)
620 {
621         /* Kill broadcast/multicast entries very aggressively, if they
622            collide in the hash table with more useful entries */
623         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
624                 rth->fl.iif && rth->dst.rt_next;
625 }
626
627 static inline int rt_valuable(struct rtable *rth)
628 {
629         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
630                 rth->dst.expires;
631 }
632
633 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
634 {
635         unsigned long age;
636         int ret = 0;
637
638         if (atomic_read(&rth->dst.__refcnt))
639                 goto out;
640
641         ret = 1;
642         if (rth->dst.expires &&
643             time_after_eq(jiffies, rth->dst.expires))
644                 goto out;
645
646         age = jiffies - rth->dst.lastuse;
647         ret = 0;
648         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
649             (age <= tmo2 && rt_valuable(rth)))
650                 goto out;
651         ret = 1;
652 out:    return ret;
653 }
654
655 /* Bits of score are:
656  * 31: very valuable
657  * 30: not quite useless
658  * 29..0: usage counter
659  */
660 static inline u32 rt_score(struct rtable *rt)
661 {
662         u32 score = jiffies - rt->dst.lastuse;
663
664         score = ~score & ~(3<<30);
665
666         if (rt_valuable(rt))
667                 score |= (1<<31);
668
669         if (!rt->fl.iif ||
670             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
671                 score |= (1<<30);
672
673         return score;
674 }
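
/*
 * Illustrative example (not part of the original source): for an output
 * route (fl.iif == 0) that is neither redirected nor carries an expiry,
 * last used 16 jiffies ago, the score is built as
 *
 *	score  = ~16 & ~(3 << 30);	usage counter in bits 29..0
 *	score |= (1 << 30);		output/unicast: "not quite useless"
 *
 * while an unreferenced, input-only broadcast entry gets neither high bit
 * and therefore compares lower, making it the preferred eviction candidate
 * in rt_intern_hash().
 */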
675
676 static inline bool rt_caching(const struct net *net)
677 {
678         return net->ipv4.current_rt_cache_rebuild_count <=
679                 net->ipv4.sysctl_rt_cache_rebuild_count;
680 }
681
682 static inline bool compare_hash_inputs(const struct flowi *fl1,
683                                         const struct flowi *fl2)
684 {
685         return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
686                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
687                 (fl1->iif ^ fl2->iif)) == 0);
688 }
689
690 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
691 {
692         return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
693                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
694                 (fl1->mark ^ fl2->mark) |
695                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
696                 (fl1->oif ^ fl2->oif) |
697                 (fl1->iif ^ fl2->iif)) == 0;
698 }
699
700 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
701 {
702         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
703 }
704
705 static inline int rt_is_expired(struct rtable *rth)
706 {
707         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
708 }
709
710 /*
711  * Perform a full scan of the hash table and free all entries.
712  * Can be called by a softirq or a process.
713  * In the latter case, we want to be rescheduled if necessary.
714  */
715 static void rt_do_flush(int process_context)
716 {
717         unsigned int i;
718         struct rtable *rth, *next;
719         struct rtable * tail;
720
721         for (i = 0; i <= rt_hash_mask; i++) {
722                 if (process_context && need_resched())
723                         cond_resched();
724                 rth = rt_hash_table[i].chain;
725                 if (!rth)
726                         continue;
727
728                 spin_lock_bh(rt_hash_lock_addr(i));
729 #ifdef CONFIG_NET_NS
730                 {
731                 struct rtable ** prev, * p;
732
733                 rth = rt_hash_table[i].chain;
734
735                 /* defer releasing the head of the list after spin_unlock */
736                 for (tail = rth; tail; tail = tail->dst.rt_next)
737                         if (!rt_is_expired(tail))
738                                 break;
739                 if (rth != tail)
740                         rt_hash_table[i].chain = tail;
741
742                 /* call rt_free on entries after the tail requiring flush */
743                 prev = &rt_hash_table[i].chain;
744                 for (p = *prev; p; p = next) {
745                         next = p->dst.rt_next;
746                         if (!rt_is_expired(p)) {
747                                 prev = &p->dst.rt_next;
748                         } else {
749                                 *prev = next;
750                                 rt_free(p);
751                         }
752                 }
753                 }
754 #else
755                 rth = rt_hash_table[i].chain;
756                 rt_hash_table[i].chain = NULL;
757                 tail = NULL;
758 #endif
759                 spin_unlock_bh(rt_hash_lock_addr(i));
760
761                 for (; rth != tail; rth = next) {
762                         next = rth->dst.rt_next;
763                         rt_free(rth);
764                 }
765         }
766 }
767
768 /*
769  * While freeing expired entries, we compute average chain length
770  * and standard deviation, using fixed-point arithmetic.
771  * This gives an estimation of rt_chain_length_max:
772  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
773  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
774  */
775
776 #define FRACT_BITS 3
777 #define ONE (1UL << FRACT_BITS)
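
/*
 * Worked example (illustrative, not part of the original source): with
 * FRACT_BITS = 3, ONE = 8, so chain lengths are accumulated in units of
 * 1/8, and has_noalias() below contributes ONE (i.e. 1.0) per distinct set
 * of hash inputs.  If rt_check_expire() measures an average chain length
 * of 2.5 entries (avg = 20 in fixed point) with a standard deviation of
 * one entry (sd = 8), then
 *
 *	(avg + 4 * sd) >> FRACT_BITS = (20 + 32) >> 3 = 6
 *
 * and rt_chain_length_max = max(ip_rt_gc_elasticity, 6) = 8 with the
 * default elasticity of 8.
 */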
778
779 /*
780  * Given a hash chain and an item in this hash chain,
781  * find whether a previous entry has the same hash_inputs
782  * (but differs on tos, mark or oif).
783  * Returns 0 if an alias is found.
784  * Returns ONE if rth has no alias before itself.
785  */
786 static int has_noalias(const struct rtable *head, const struct rtable *rth)
787 {
788         const struct rtable *aux = head;
789
790         while (aux != rth) {
791                 if (compare_hash_inputs(&aux->fl, &rth->fl))
792                         return 0;
793                 aux = aux->dst.rt_next;
794         }
795         return ONE;
796 }
797
798 static void rt_check_expire(void)
799 {
800         static unsigned int rover;
801         unsigned int i = rover, goal;
802         struct rtable *rth, **rthp;
803         unsigned long samples = 0;
804         unsigned long sum = 0, sum2 = 0;
805         unsigned long delta;
806         u64 mult;
807
808         delta = jiffies - expires_ljiffies;
809         expires_ljiffies = jiffies;
810         mult = ((u64)delta) << rt_hash_log;
811         if (ip_rt_gc_timeout > 1)
812                 do_div(mult, ip_rt_gc_timeout);
813         goal = (unsigned int)mult;
814         if (goal > rt_hash_mask)
815                 goal = rt_hash_mask + 1;
816         for (; goal > 0; goal--) {
817                 unsigned long tmo = ip_rt_gc_timeout;
818                 unsigned long length;
819
820                 i = (i + 1) & rt_hash_mask;
821                 rthp = &rt_hash_table[i].chain;
822
823                 if (need_resched())
824                         cond_resched();
825
826                 samples++;
827
828                 if (*rthp == NULL)
829                         continue;
830                 length = 0;
831                 spin_lock_bh(rt_hash_lock_addr(i));
832                 while ((rth = *rthp) != NULL) {
833                         prefetch(rth->dst.rt_next);
834                         if (rt_is_expired(rth)) {
835                                 *rthp = rth->dst.rt_next;
836                                 rt_free(rth);
837                                 continue;
838                         }
839                         if (rth->dst.expires) {
840                                 /* Entry is expired even if it is in use */
841                                 if (time_before_eq(jiffies, rth->dst.expires)) {
842 nofree:
843                                         tmo >>= 1;
844                                         rthp = &rth->dst.rt_next;
845                                         /*
846                                          * We only count entries on
847                                          * a chain with equal hash inputs once
848                                          * so that entries for different QOS
849                                          * levels, and other non-hash input
850                                          * attributes don't unfairly skew
851                                          * the length computation
852                                          */
853                                         length += has_noalias(rt_hash_table[i].chain, rth);
854                                         continue;
855                                 }
856                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
857                                 goto nofree;
858
859                         /* Cleanup aged off entries. */
860                         *rthp = rth->dst.rt_next;
861                         rt_free(rth);
862                 }
863                 spin_unlock_bh(rt_hash_lock_addr(i));
864                 sum += length;
865                 sum2 += length*length;
866         }
867         if (samples) {
868                 unsigned long avg = sum / samples;
869                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
870                 rt_chain_length_max = max_t(unsigned long,
871                                         ip_rt_gc_elasticity,
872                                         (avg + 4*sd) >> FRACT_BITS);
873         }
874         rover = i;
875 }
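
/*
 * Illustrative note (not part of the original source): the "goal" computed
 * above scales the number of buckets scanned per pass so that the whole
 * table is covered roughly once every ip_rt_gc_timeout.  Assuming a hash
 * table of 2^17 buckets (rt_hash_log = 17) and the defaults of a 60 * HZ
 * work interval and a 300 * HZ gc timeout:
 *
 *	goal = (60 * HZ << 17) / (300 * HZ) = 131072 / 5 ~= 26214
 *
 * i.e. about one fifth of the buckets are inspected on each pass.
 */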
876
877 /*
878  * rt_worker_func() is run in process context.
879  * We call rt_check_expire() to scan part of the hash table.
880  */
881 static void rt_worker_func(struct work_struct *work)
882 {
883         rt_check_expire();
884         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885 }
886
887 /*
888  * Perturbation of rt_genid by a small quantity [1..256].
889  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
890  * many times (2^24) without reusing a recent rt_genid.
891  * Jenkins hash is strong enough that little changes of rt_genid are OK.
892  */
893 static void rt_cache_invalidate(struct net *net)
894 {
895         unsigned char shuffle;
896
897         get_random_bytes(&shuffle, sizeof(shuffle));
898         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
899 }
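
/*
 * Illustrative note (not part of the original source): cached entries are
 * not touched by an invalidation; they simply stop matching.  Since shuffle
 * is in [0, 255], the increment is in [1, 256], so net->ipv4.rt_genid is
 * guaranteed to change, and every rtable whose rt_genid was recorded before
 * the call now fails the rt_is_expired() check and is reaped lazily by
 * lookups, rt_check_expire() or rt_do_flush().
 */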
900
901 /*
902  * delay < 0  : invalidate cache (fast : entries will be deleted later)
903  * delay >= 0 : invalidate & flush cache (can be long)
904  */
905 void rt_cache_flush(struct net *net, int delay)
906 {
907         rt_cache_invalidate(net);
908         if (delay >= 0)
909                 rt_do_flush(!in_softirq());
910 }
911
912 /* Flush previously invalidated entries from the cache */
913 void rt_cache_flush_batch(void)
914 {
915         rt_do_flush(!in_softirq());
916 }
917
918 static void rt_emergency_hash_rebuild(struct net *net)
919 {
920         if (net_ratelimit())
921                 printk(KERN_WARNING "Route hash chain too long!\n");
922         rt_cache_invalidate(net);
923 }
924
925 /*
926    Short description of GC goals.
927
928    We want to build an algorithm which will keep the routing cache
929    at some equilibrium point, where the number of aged-off entries
930    is kept approximately equal to newly generated ones.
931
932    Current expiration strength is the variable "expire".
933    We try to adjust it dynamically, so that when networking
934    is idle, expire is large enough to keep enough warm entries,
935    and when load increases it shrinks to limit the cache size.
936  */
937
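/*
 * Worked example (illustrative, not part of the original source): with the
 * default ip_rt_gc_elasticity of 8 and assuming a table of 2^17 buckets,
 * the comfortable ceiling used below is
 *
 *	ip_rt_gc_elasticity << rt_hash_log = 8 * 131072 = 1048576 entries,
 *
 * i.e. eight entries per bucket on average.  Only when the dst entry count
 * exceeds that does the collector enter the aggressive branch that trims
 * the cache back towards the computed equilibrium.
 */
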
938 static int rt_garbage_collect(struct dst_ops *ops)
939 {
940         static unsigned long expire = RT_GC_TIMEOUT;
941         static unsigned long last_gc;
942         static int rover;
943         static int equilibrium;
944         struct rtable *rth, **rthp;
945         unsigned long now = jiffies;
946         int goal;
947         int entries = dst_entries_get_fast(&ipv4_dst_ops);
948
949         /*
950          * Garbage collection is pretty expensive,
951          * do not make it too frequently.
952          */
953
954         RT_CACHE_STAT_INC(gc_total);
955
956         if (now - last_gc < ip_rt_gc_min_interval &&
957             entries < ip_rt_max_size) {
958                 RT_CACHE_STAT_INC(gc_ignored);
959                 goto out;
960         }
961
962         entries = dst_entries_get_slow(&ipv4_dst_ops);
963         /* Calculate number of entries, which we want to expire now. */
964         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
965         if (goal <= 0) {
966                 if (equilibrium < ipv4_dst_ops.gc_thresh)
967                         equilibrium = ipv4_dst_ops.gc_thresh;
968                 goal = entries - equilibrium;
969                 if (goal > 0) {
970                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971                         goal = entries - equilibrium;
972                 }
973         } else {
974                 /* We are in dangerous area. Try to reduce cache really
975                  * aggressively.
976                  */
977                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978                 equilibrium = entries - goal;
979         }
980
981         if (now - last_gc >= ip_rt_gc_min_interval)
982                 last_gc = now;
983
984         if (goal <= 0) {
985                 equilibrium += goal;
986                 goto work_done;
987         }
988
989         do {
990                 int i, k;
991
992                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
993                         unsigned long tmo = expire;
994
995                         k = (k + 1) & rt_hash_mask;
996                         rthp = &rt_hash_table[k].chain;
997                         spin_lock_bh(rt_hash_lock_addr(k));
998                         while ((rth = *rthp) != NULL) {
999                                 if (!rt_is_expired(rth) &&
1000                                         !rt_may_expire(rth, tmo, expire)) {
1001                                         tmo >>= 1;
1002                                         rthp = &rth->dst.rt_next;
1003                                         continue;
1004                                 }
1005                                 *rthp = rth->dst.rt_next;
1006                                 rt_free(rth);
1007                                 goal--;
1008                         }
1009                         spin_unlock_bh(rt_hash_lock_addr(k));
1010                         if (goal <= 0)
1011                                 break;
1012                 }
1013                 rover = k;
1014
1015                 if (goal <= 0)
1016                         goto work_done;
1017
1018                 /* Goal is not achieved. We stop the process if:
1019
1020                    - expire is reduced to zero; otherwise, expire is halved.
1021                    - the table is not full.
1022                    - we are called from interrupt.
1023                    - the jiffies check is just a fallback/debug loop breaker.
1024                      We will not spin here for a long time in any case.
1025                  */
1026
1027                 RT_CACHE_STAT_INC(gc_goal_miss);
1028
1029                 if (expire == 0)
1030                         break;
1031
1032                 expire >>= 1;
1033 #if RT_CACHE_DEBUG >= 2
1034                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1036 #endif
1037
1038                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039                         goto out;
1040         } while (!in_softirq() && time_before_eq(jiffies, now));
1041
1042         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1043                 goto out;
1044         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1045                 goto out;
1046         if (net_ratelimit())
1047                 printk(KERN_WARNING "dst cache overflow\n");
1048         RT_CACHE_STAT_INC(gc_dst_overflow);
1049         return 1;
1050
1051 work_done:
1052         expire += ip_rt_gc_min_interval;
1053         if (expire > ip_rt_gc_timeout ||
1054             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1055             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1056                 expire = ip_rt_gc_timeout;
1057 #if RT_CACHE_DEBUG >= 2
1058         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1059                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1060 #endif
1061 out:    return 0;
1062 }
1063
1064 /*
1065  * Returns the number of entries in a hash chain that have different hash_inputs.
1066  */
1067 static int slow_chain_length(const struct rtable *head)
1068 {
1069         int length = 0;
1070         const struct rtable *rth = head;
1071
1072         while (rth) {
1073                 length += has_noalias(head, rth);
1074                 rth = rth->dst.rt_next;
1075         }
1076         return length >> FRACT_BITS;
1077 }
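
/*
 * Illustrative example (not part of the original source): for a chain of
 * five entries in which two share the same hash inputs (e.g. they differ
 * only in TOS), has_noalias() returns ONE for four of them and 0 for the
 * duplicate, so slow_chain_length() yields (4 * ONE) >> FRACT_BITS = 4,
 * the number of distinct flows rather than the raw chain length of 5.
 */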
1078
1079 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1080                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1081 {
1082         struct rtable   *rth, **rthp;
1083         unsigned long   now;
1084         struct rtable *cand, **candp;
1085         u32             min_score;
1086         int             chain_length;
1087         int attempts = !in_softirq();
1088
1089 restart:
1090         chain_length = 0;
1091         min_score = ~(u32)0;
1092         cand = NULL;
1093         candp = NULL;
1094         now = jiffies;
1095
1096         if (!rt_caching(dev_net(rt->dst.dev))) {
1097                 /*
1098                  * If we're not caching, just tell the caller we
1099                  * were successful and don't touch the route.  The
1100                  * caller holds the sole reference to the cache entry, and
1101                  * it will be released when the caller is done with it.
1102                  * If we drop it here, the callers have no way to resolve routes
1103                  * when we're not caching.  Instead, just point *rp at rt, so
1104                  * the caller gets a single use out of the route
1105                  * Note that we do rt_free on this new route entry, so that
1106                  * once its refcount hits zero, we are still able to reap it
1107                  * (Thanks Alexey)
1108                  * Note also that rt_free uses call_rcu.  We don't actually
1109                  * need rcu protection here, this is just our path to get
1110                  * on the route gc list.
1111                  */
1112
1113                 rt->dst.flags |= DST_NOCACHE;
1114                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1115                         int err = arp_bind_neighbour(&rt->dst);
1116                         if (err) {
1117                                 if (net_ratelimit())
1118                                         printk(KERN_WARNING
1119                                             "Neighbour table failure & not caching routes.\n");
1120                                 rt_drop(rt);
1121                                 return err;
1122                         }
1123                 }
1124
1125                 rt_free(rt);
1126                 goto skip_hashing;
1127         }
1128
1129         rthp = &rt_hash_table[hash].chain;
1130
1131         spin_lock_bh(rt_hash_lock_addr(hash));
1132         while ((rth = *rthp) != NULL) {
1133                 if (rt_is_expired(rth)) {
1134                         *rthp = rth->dst.rt_next;
1135                         rt_free(rth);
1136                         continue;
1137                 }
1138                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1139                         /* Put it first */
1140                         *rthp = rth->dst.rt_next;
1141                         /*
1142                          * Since lookup is lockfree, the deletion
1143                          * must be visible to another weakly ordered CPU before
1144                          * the insertion at the start of the hash chain.
1145                          */
1146                         rcu_assign_pointer(rth->dst.rt_next,
1147                                            rt_hash_table[hash].chain);
1148                         /*
1149                          * Since lookup is lockfree, the update writes
1150                          * must be ordered for consistency on SMP.
1151                          */
1152                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1153
1154                         dst_use(&rth->dst, now);
1155                         spin_unlock_bh(rt_hash_lock_addr(hash));
1156
1157                         rt_drop(rt);
1158                         if (rp)
1159                                 *rp = rth;
1160                         else
1161                                 skb_dst_set(skb, &rth->dst);
1162                         return 0;
1163                 }
1164
1165                 if (!atomic_read(&rth->dst.__refcnt)) {
1166                         u32 score = rt_score(rth);
1167
1168                         if (score <= min_score) {
1169                                 cand = rth;
1170                                 candp = rthp;
1171                                 min_score = score;
1172                         }
1173                 }
1174
1175                 chain_length++;
1176
1177                 rthp = &rth->dst.rt_next;
1178         }
1179
1180         if (cand) {
1181                 /* ip_rt_gc_elasticity used to be the average chain
1182                  * length; when it is exceeded, gc becomes really aggressive.
1183                  *
1184                  * The second limit is less certain. At the moment it allows
1185                  * only 2 entries per bucket. We will see.
1186                  */
1187                 if (chain_length > ip_rt_gc_elasticity) {
1188                         *candp = cand->dst.rt_next;
1189                         rt_free(cand);
1190                 }
1191         } else {
1192                 if (chain_length > rt_chain_length_max &&
1193                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1194                         struct net *net = dev_net(rt->dst.dev);
1195                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1196                         if (!rt_caching(net)) {
1197                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1198                                         rt->dst.dev->name, num);
1199                         }
1200                         rt_emergency_hash_rebuild(net);
1201                         spin_unlock_bh(rt_hash_lock_addr(hash));
1202
1203                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1204                                         ifindex, rt_genid(net));
1205                         goto restart;
1206                 }
1207         }
1208
1209         /* Try to bind the route to arp only if it is an output
1210            route or on the unicast forwarding path.
1211          */
1212         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1213                 int err = arp_bind_neighbour(&rt->dst);
1214                 if (err) {
1215                         spin_unlock_bh(rt_hash_lock_addr(hash));
1216
1217                         if (err != -ENOBUFS) {
1218                                 rt_drop(rt);
1219                                 return err;
1220                         }
1221
1222                         /* Neighbour tables are full and nothing
1223                            can be released. Try to shrink the route cache;
1224                            it most likely holds some neighbour records.
1225                          */
1226                         if (attempts-- > 0) {
1227                                 int saved_elasticity = ip_rt_gc_elasticity;
1228                                 int saved_int = ip_rt_gc_min_interval;
1229                                 ip_rt_gc_elasticity     = 1;
1230                                 ip_rt_gc_min_interval   = 0;
1231                                 rt_garbage_collect(&ipv4_dst_ops);
1232                                 ip_rt_gc_min_interval   = saved_int;
1233                                 ip_rt_gc_elasticity     = saved_elasticity;
1234                                 goto restart;
1235                         }
1236
1237                         if (net_ratelimit())
1238                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1239                         rt_drop(rt);
1240                         return -ENOBUFS;
1241                 }
1242         }
1243
1244         rt->dst.rt_next = rt_hash_table[hash].chain;
1245
1246 #if RT_CACHE_DEBUG >= 2
1247         if (rt->dst.rt_next) {
1248                 struct rtable *trt;
1249                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1250                        hash, &rt->rt_dst);
1251                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1252                         printk(" . %pI4", &trt->rt_dst);
1253                 printk("\n");
1254         }
1255 #endif
1256         /*
1257          * Since lookup is lockfree, we must make sure
1258          * previous writes to rt are committed to memory
1259          * before making rt visible to other CPUs.
1260          */
1261         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1262
1263         spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265 skip_hashing:
1266         if (rp)
1267                 *rp = rt;
1268         else
1269                 skb_dst_set(skb, &rt->dst);
1270         return 0;
1271 }
1272
1273 void rt_bind_peer(struct rtable *rt, int create)
1274 {
1275         struct inet_peer *peer;
1276
1277         peer = inet_getpeer(rt->rt_dst, create);
1278
1279         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1280                 inet_putpeer(peer);
1281 }
1282
1283 /*
1284  * Peer allocation may fail only in serious out-of-memory conditions.  However
1285  * we still can generate some output.
1286  * Random ID selection looks a bit dangerous because we have no chance of
1287  * selecting an ID that is unique over a reasonable period of time.
1288  * But a broken packet identifier may be better than no packet at all.
1289  */
1290 static void ip_select_fb_ident(struct iphdr *iph)
1291 {
1292         static DEFINE_SPINLOCK(ip_fb_id_lock);
1293         static u32 ip_fallback_id;
1294         u32 salt;
1295
1296         spin_lock_bh(&ip_fb_id_lock);
1297         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1298         iph->id = htons(salt & 0xFFFF);
1299         ip_fallback_id = salt;
1300         spin_unlock_bh(&ip_fb_id_lock);
1301 }
1302
1303 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1304 {
1305         struct rtable *rt = (struct rtable *) dst;
1306
1307         if (rt) {
1308                 if (rt->peer == NULL)
1309                         rt_bind_peer(rt, 1);
1310
1311                 /* If peer is attached to destination, it is never detached,
1312                    so we do not need to grab a lock to dereference it.
1313                  */
1314                 if (rt->peer) {
1315                         iph->id = htons(inet_getid(rt->peer, more));
1316                         return;
1317                 }
1318         } else
1319                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1320                        __builtin_return_address(0));
1321
1322         ip_select_fb_ident(iph);
1323 }
1324 EXPORT_SYMBOL(__ip_select_ident);
1325
1326 static void rt_del(unsigned hash, struct rtable *rt)
1327 {
1328         struct rtable **rthp, *aux;
1329
1330         rthp = &rt_hash_table[hash].chain;
1331         spin_lock_bh(rt_hash_lock_addr(hash));
1332         ip_rt_put(rt);
1333         while ((aux = *rthp) != NULL) {
1334                 if (aux == rt || rt_is_expired(aux)) {
1335                         *rthp = aux->dst.rt_next;
1336                         rt_free(aux);
1337                         continue;
1338                 }
1339                 rthp = &aux->dst.rt_next;
1340         }
1341         spin_unlock_bh(rt_hash_lock_addr(hash));
1342 }
1343
1344 /* called in rcu_read_lock() section */
1345 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1346                     __be32 saddr, struct net_device *dev)
1347 {
1348         int i, k;
1349         struct in_device *in_dev = __in_dev_get_rcu(dev);
1350         struct rtable *rth, **rthp;
1351         __be32  skeys[2] = { saddr, 0 };
1352         int  ikeys[2] = { dev->ifindex, 0 };
1353         struct netevent_redirect netevent;
1354         struct net *net;
1355
1356         if (!in_dev)
1357                 return;
1358
1359         net = dev_net(dev);
1360         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1361             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1362             ipv4_is_zeronet(new_gw))
1363                 goto reject_redirect;
1364
1365         if (!rt_caching(net))
1366                 goto reject_redirect;
1367
1368         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1369                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1370                         goto reject_redirect;
1371                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1372                         goto reject_redirect;
1373         } else {
1374                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1375                         goto reject_redirect;
1376         }
1377
1378         for (i = 0; i < 2; i++) {
1379                 for (k = 0; k < 2; k++) {
1380                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1381                                                 rt_genid(net));
1382
1383                         rthp=&rt_hash_table[hash].chain;
1384
1385                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1386                                 struct rtable *rt;
1387
1388                                 if (rth->fl.fl4_dst != daddr ||
1389                                     rth->fl.fl4_src != skeys[i] ||
1390                                     rth->fl.oif != ikeys[k] ||
1391                                     rth->fl.iif != 0 ||
1392                                     rt_is_expired(rth) ||
1393                                     !net_eq(dev_net(rth->dst.dev), net)) {
1394                                         rthp = &rth->dst.rt_next;
1395                                         continue;
1396                                 }
1397
1398                                 if (rth->rt_dst != daddr ||
1399                                     rth->rt_src != saddr ||
1400                                     rth->dst.error ||
1401                                     rth->rt_gateway != old_gw ||
1402                                     rth->dst.dev != dev)
1403                                         break;
1404
1405                                 dst_hold(&rth->dst);
1406
1407                                 rt = dst_alloc(&ipv4_dst_ops);
1408                                 if (rt == NULL) {
1409                                         ip_rt_put(rth);
1410                                         return;
1411                                 }
1412
1413                                 /* Copy all the information. */
1414                                 *rt = *rth;
1415                                 rt->dst.__use           = 1;
1416                                 atomic_set(&rt->dst.__refcnt, 1);
1417                                 rt->dst.child           = NULL;
1418                                 if (rt->dst.dev)
1419                                         dev_hold(rt->dst.dev);
1420                                 if (rt->idev)
1421                                         in_dev_hold(rt->idev);
1422                                 rt->dst.obsolete        = -1;
1423                                 rt->dst.lastuse = jiffies;
1424                                 rt->dst.path            = &rt->dst;
1425                                 rt->dst.neighbour       = NULL;
1426                                 rt->dst.hh              = NULL;
1427 #ifdef CONFIG_XFRM
1428                                 rt->dst.xfrm            = NULL;
1429 #endif
1430                                 rt->rt_genid            = rt_genid(net);
1431                                 rt->rt_flags            |= RTCF_REDIRECTED;
1432
1433                                 /* Gateway is different ... */
1434                                 rt->rt_gateway          = new_gw;
1435
1436                                 /* Redirect received -> path was valid */
1437                                 dst_confirm(&rth->dst);
1438
1439                                 if (rt->peer)
1440                                         atomic_inc(&rt->peer->refcnt);
1441
1442                                 if (arp_bind_neighbour(&rt->dst) ||
1443                                     !(rt->dst.neighbour->nud_state &
1444                                             NUD_VALID)) {
1445                                         if (rt->dst.neighbour)
1446                                                 neigh_event_send(rt->dst.neighbour, NULL);
1447                                         ip_rt_put(rth);
1448                                         rt_drop(rt);
1449                                         goto do_next;
1450                                 }
1451
1452                                 netevent.old = &rth->dst;
1453                                 netevent.new = &rt->dst;
1454                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1455                                                         &netevent);
1456
1457                                 rt_del(hash, rth);
1458                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1459                                         ip_rt_put(rt);
1460                                 goto do_next;
1461                         }
1462                 do_next:
1463                         ;
1464                 }
1465         }
1466         return;
1467
1468 reject_redirect:
1469 #ifdef CONFIG_IP_ROUTE_VERBOSE
1470         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1471                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1472                         "  Advised path = %pI4 -> %pI4\n",
1473                        &old_gw, dev->name, &new_gw,
1474                        &saddr, &daddr);
1475 #endif
1476         ;
1477 }
1478
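/*
 * dst_ops->negative_advice callback: a socket suspects its cached route
 * has gone bad.  Drop the reference on obsolete entries, and unhash
 * entries that were created by an ICMP redirect or have expired, so the
 * caller is forced to do a fresh lookup.
 */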
1479 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1480 {
1481         struct rtable *rt = (struct rtable *)dst;
1482         struct dst_entry *ret = dst;
1483
1484         if (rt) {
1485                 if (dst->obsolete > 0) {
1486                         ip_rt_put(rt);
1487                         ret = NULL;
1488                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1489                            (rt->dst.expires &&
1490                             time_after_eq(jiffies, rt->dst.expires))) {
1491                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1492                                                 rt->fl.oif,
1493                                                 rt_genid(dev_net(dst->dev)));
1494 #if RT_CACHE_DEBUG >= 1
1495                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1496                                 &rt->rt_dst, rt->fl.fl4_tos);
1497 #endif
1498                         rt_del(hash, rt);
1499                         ret = NULL;
1500                 }
1501         }
1502         return ret;
1503 }
1504
1505 /*
1506  * Algorithm:
1507  *      1. The first ip_rt_redirect_number redirects are sent
1508  *         with exponential backoff, then we stop sending them at all,
1509  *         assuming that the host ignores our redirects.
1510  *      2. If we did not see packets requiring redirects
1511  *         during ip_rt_redirect_silence, we assume that the host
1512  *         has forgotten the redirected route and start sending redirects again.
1513  *
1514  * This algorithm is much cheaper and more intelligent than dumb load limiting
1515  * in icmp.c.
1516  *
1517  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1518  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1519  */
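/*
 * Worked example (assuming the default sysctl values defined earlier in
 * this file: ip_rt_redirect_number == 9, ip_rt_redirect_load == HZ/50,
 * ip_rt_redirect_silence == (HZ/50) << 10): the first redirect goes out
 * immediately, each subsequent one only after ip_rt_redirect_load <<
 * rate_tokens jiffies have passed since the previous one, i.e. the spacing
 * doubles every time.  After 9 redirects we stay silent until no
 * redirect-worthy packet has been seen for ip_rt_redirect_silence
 * (roughly 20 seconds), which resets rate_tokens.
 */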
1520
1521 void ip_rt_send_redirect(struct sk_buff *skb)
1522 {
1523         struct rtable *rt = skb_rtable(skb);
1524         struct in_device *in_dev;
1525         int log_martians;
1526
1527         rcu_read_lock();
1528         in_dev = __in_dev_get_rcu(rt->dst.dev);
1529         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1530                 rcu_read_unlock();
1531                 return;
1532         }
1533         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1534         rcu_read_unlock();
1535
1536         /* No redirected packets during ip_rt_redirect_silence;
1537          * reset the algorithm.
1538          */
1539         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1540                 rt->dst.rate_tokens = 0;
1541
1542         /* Too many ignored redirects; do not send anything
1543          * and set dst.rate_last to the last seen redirected packet.
1544          */
1545         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1546                 rt->dst.rate_last = jiffies;
1547                 return;
1548         }
1549
1550         /* Check for load limit; set rate_last to the latest sent
1551          * redirect.
1552          */
1553         if (rt->dst.rate_tokens == 0 ||
1554             time_after(jiffies,
1555                        (rt->dst.rate_last +
1556                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1557                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1558                 rt->dst.rate_last = jiffies;
1559                 ++rt->dst.rate_tokens;
1560 #ifdef CONFIG_IP_ROUTE_VERBOSE
1561                 if (log_martians &&
1562                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1563                     net_ratelimit())
1564                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1565                                 &rt->rt_src, rt->rt_iif,
1566                                 &rt->rt_dst, &rt->rt_gateway);
1567 #endif
1568         }
1569 }
1570
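/*
 * Input handler for cached routes that carry a pending error: translate
 * dst.error into an ICMP destination-unreachable code and send it, subject
 * to a simple token-bucket rate limit, then drop the packet.
 */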
1571 static int ip_error(struct sk_buff *skb)
1572 {
1573         struct rtable *rt = skb_rtable(skb);
1574         unsigned long now;
1575         int code;
1576
1577         switch (rt->dst.error) {
1578                 case EINVAL:
1579                 default:
1580                         goto out;
1581                 case EHOSTUNREACH:
1582                         code = ICMP_HOST_UNREACH;
1583                         break;
1584                 case ENETUNREACH:
1585                         code = ICMP_NET_UNREACH;
1586                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1587                                         IPSTATS_MIB_INNOROUTES);
1588                         break;
1589                 case EACCES:
1590                         code = ICMP_PKT_FILTERED;
1591                         break;
1592         }
1593
1594         now = jiffies;
1595         rt->dst.rate_tokens += now - rt->dst.rate_last;
1596         if (rt->dst.rate_tokens > ip_rt_error_burst)
1597                 rt->dst.rate_tokens = ip_rt_error_burst;
1598         rt->dst.rate_last = now;
1599         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1600                 rt->dst.rate_tokens -= ip_rt_error_cost;
1601                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1602         }
1603
1604 out:    kfree_skb(skb);
1605         return 0;
1606 }
1607
1608 /*
1609  *      The last two values are not from the RFC but
1610  *      are needed for AMPRnet AX.25 paths.
1611  */
1612
1613 static const unsigned short mtu_plateau[] =
1614 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1615
1616 static inline unsigned short guess_mtu(unsigned short old_mtu)
1617 {
1618         int i;
1619
1620         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1621                 if (old_mtu > mtu_plateau[i])
1622                         return mtu_plateau[i];
1623         return 68;
1624 }
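/*
 * For illustration: guess_mtu(1500) yields 1492 (the next plateau below
 * 1500) and guess_mtu(576) yields 296; anything at or below 128 falls
 * back to the 68-byte IPv4 minimum.
 */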
1625
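/*
 * Handle an ICMP "fragmentation needed" report: walk the cached routes
 * that match the offending header, lower their path MTU (never below
 * ip_rt_min_pmtu) and set an expiry, falling back to the plateau table
 * when the advertised MTU is unusable.  Returns the estimated MTU, or
 * new_mtu if no cache entry matched.
 */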
1626 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1627                                  unsigned short new_mtu,
1628                                  struct net_device *dev)
1629 {
1630         int i, k;
1631         unsigned short old_mtu = ntohs(iph->tot_len);
1632         struct rtable *rth;
1633         int  ikeys[2] = { dev->ifindex, 0 };
1634         __be32  skeys[2] = { iph->saddr, 0, };
1635         __be32  daddr = iph->daddr;
1636         unsigned short est_mtu = 0;
1637
1638         for (k = 0; k < 2; k++) {
1639                 for (i = 0; i < 2; i++) {
1640                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1641                                                 rt_genid(net));
1642
1643                         rcu_read_lock();
1644                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1645                              rth = rcu_dereference(rth->dst.rt_next)) {
1646                                 unsigned short mtu = new_mtu;
1647
1648                                 if (rth->fl.fl4_dst != daddr ||
1649                                     rth->fl.fl4_src != skeys[i] ||
1650                                     rth->rt_dst != daddr ||
1651                                     rth->rt_src != iph->saddr ||
1652                                     rth->fl.oif != ikeys[k] ||
1653                                     rth->fl.iif != 0 ||
1654                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1655                                     !net_eq(dev_net(rth->dst.dev), net) ||
1656                                     rt_is_expired(rth))
1657                                         continue;
1658
1659                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1660
1661                                         /* BSD 4.2 compatibility hack :-( */
1662                                         if (mtu == 0 &&
1663                                             old_mtu >= dst_mtu(&rth->dst) &&
1664                                             old_mtu >= 68 + (iph->ihl << 2))
1665                                                 old_mtu -= iph->ihl << 2;
1666
1667                                         mtu = guess_mtu(old_mtu);
1668                                 }
1669                                 if (mtu <= dst_mtu(&rth->dst)) {
1670                                         if (mtu < dst_mtu(&rth->dst)) {
1671                                                 dst_confirm(&rth->dst);
1672                                                 if (mtu < ip_rt_min_pmtu) {
1673                                                         mtu = ip_rt_min_pmtu;
1674                                                         rth->dst.metrics[RTAX_LOCK-1] |=
1675                                                                 (1 << RTAX_MTU);
1676                                                 }
1677                                                 rth->dst.metrics[RTAX_MTU-1] = mtu;
1678                                                 dst_set_expires(&rth->dst,
1679                                                         ip_rt_mtu_expires);
1680                                         }
1681                                         est_mtu = mtu;
1682                                 }
1683                         }
1684                         rcu_read_unlock();
1685                 }
1686         }
1687         return est_mtu ? : new_mtu;
1688 }
1689
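/*
 * dst_ops->update_pmtu callback: shrink the cached MTU (clamped to
 * ip_rt_min_pmtu, skipped entirely if the metric is locked), set an
 * expiry and notify netevent listeners of the PMTU change.
 */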
1690 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1691 {
1692         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1693             !(dst_metric_locked(dst, RTAX_MTU))) {
1694                 if (mtu < ip_rt_min_pmtu) {
1695                         mtu = ip_rt_min_pmtu;
1696                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1697                 }
1698                 dst->metrics[RTAX_MTU-1] = mtu;
1699                 dst_set_expires(dst, ip_rt_mtu_expires);
1700                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1701         }
1702 }
1703
1704 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1705 {
1706         if (rt_is_expired((struct rtable *)dst))
1707                 return NULL;
1708         return dst;
1709 }
1710
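/* Release the inet_peer and in_device references held by a dying route. */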
1711 static void ipv4_dst_destroy(struct dst_entry *dst)
1712 {
1713         struct rtable *rt = (struct rtable *) dst;
1714         struct inet_peer *peer = rt->peer;
1715         struct in_device *idev = rt->idev;
1716
1717         if (peer) {
1718                 rt->peer = NULL;
1719                 inet_putpeer(peer);
1720         }
1721
1722         if (idev) {
1723                 rt->idev = NULL;
1724                 in_dev_put(idev);
1725         }
1726 }
1727
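/*
 * The underlying device is going away: re-point the route's in_device
 * reference at the namespace's loopback device so the entry remains safe
 * to use.
 */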
1728 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1729                             int how)
1730 {
1731         struct rtable *rt = (struct rtable *) dst;
1732         struct in_device *idev = rt->idev;
1733         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1734                 struct in_device *loopback_idev =
1735                         in_dev_get(dev_net(dev)->loopback_dev);
1736                 if (loopback_idev) {
1737                         rt->idev = loopback_idev;
1738                         in_dev_put(idev);
1739                 }
1740         }
1741 }
1742
1743 static void ipv4_link_failure(struct sk_buff *skb)
1744 {
1745         struct rtable *rt;
1746
1747         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1748
1749         rt = skb_rtable(skb);
1750         if (rt)
1751                 dst_set_expires(&rt->dst, 0);
1752 }
1753
1754 static int ip_rt_bug(struct sk_buff *skb)
1755 {
1756         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1757                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1758                 skb->dev ? skb->dev->name : "?");
1759         kfree_skb(skb);
1760         return 0;
1761 }
1762
1763 /*
1764    We do not cache the source address of the outgoing interface,
1765    because it is used only by the IP RR, TS and SRR options,
1766    so it is out of the fast path.
1767
1768    BTW remember: "addr" is allowed to be unaligned
1769    in IP options!
1770  */
1771
1772 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1773 {
1774         __be32 src;
1775         struct fib_result res;
1776
1777         if (rt->fl.iif == 0)
1778                 src = rt->rt_src;
1779         else {
1780                 rcu_read_lock();
1781                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1782                         src = FIB_RES_PREFSRC(res);
1783                 else
1784                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1785                                         RT_SCOPE_UNIVERSE);
1786                 rcu_read_unlock();
1787         }
1788         memcpy(addr, &src, 4);
1789 }
1790
1791 #ifdef CONFIG_NET_CLS_ROUTE
1792 static void set_class_tag(struct rtable *rt, u32 tag)
1793 {
1794         if (!(rt->dst.tclassid & 0xFFFF))
1795                 rt->dst.tclassid |= tag & 0xFFFF;
1796         if (!(rt->dst.tclassid & 0xFFFF0000))
1797                 rt->dst.tclassid |= tag & 0xFFFF0000;
1798 }
1799 #endif
1800
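/*
 * Fill in the gateway, metrics, classid and type of a new cache entry
 * from the FIB lookup result, applying the usual MTU, advmss and
 * hop-limit defaults and caps.
 */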
1801 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1802 {
1803         struct fib_info *fi = res->fi;
1804
1805         if (fi) {
1806                 if (FIB_RES_GW(*res) &&
1807                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1808                         rt->rt_gateway = FIB_RES_GW(*res);
1809                 memcpy(rt->dst.metrics, fi->fib_metrics,
1810                        sizeof(rt->dst.metrics));
1811                 if (fi->fib_mtu == 0) {
1812                         rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1813                         if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1814                             rt->rt_gateway != rt->rt_dst &&
1815                             rt->dst.dev->mtu > 576)
1816                                 rt->dst.metrics[RTAX_MTU-1] = 576;
1817                 }
1818 #ifdef CONFIG_NET_CLS_ROUTE
1819                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1820 #endif
1821         } else
1822                 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1823
1824         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1825                 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1826         if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1827                 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1828         if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1829                 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1830                                        ip_rt_min_advmss);
1831         if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1832                 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1833
1834 #ifdef CONFIG_NET_CLS_ROUTE
1835 #ifdef CONFIG_IP_MULTIPLE_TABLES
1836         set_class_tag(rt, fib_rules_tclass(res));
1837 #endif
1838         set_class_tag(rt, itag);
1839 #endif
1840         rt->rt_type = res->type;
1841 }
1842
1843 /* called in rcu_read_lock() section */
1844 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1845                                 u8 tos, struct net_device *dev, int our)
1846 {
1847         unsigned int hash;
1848         struct rtable *rth;
1849         __be32 spec_dst;
1850         struct in_device *in_dev = __in_dev_get_rcu(dev);
1851         u32 itag = 0;
1852         int err;
1853
1854         /* Primary sanity checks. */
1855
1856         if (in_dev == NULL)
1857                 return -EINVAL;
1858
1859         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1860             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1861                 goto e_inval;
1862
1863         if (ipv4_is_zeronet(saddr)) {
1864                 if (!ipv4_is_local_multicast(daddr))
1865                         goto e_inval;
1866                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867         } else {
1868                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1869                                           &itag, 0);
1870                 if (err < 0)
1871                         goto e_err;
1872         }
1873         rth = dst_alloc(&ipv4_dst_ops);
1874         if (!rth)
1875                 goto e_nobufs;
1876
1877         rth->dst.output = ip_rt_bug;
1878         rth->dst.obsolete = -1;
1879
1880         atomic_set(&rth->dst.__refcnt, 1);
1881         rth->dst.flags= DST_HOST;
1882         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1883                 rth->dst.flags |= DST_NOPOLICY;
1884         rth->fl.fl4_dst = daddr;
1885         rth->rt_dst     = daddr;
1886         rth->fl.fl4_tos = tos;
1887         rth->fl.mark    = skb->mark;
1888         rth->fl.fl4_src = saddr;
1889         rth->rt_src     = saddr;
1890 #ifdef CONFIG_NET_CLS_ROUTE
1891         rth->dst.tclassid = itag;
1892 #endif
1893         rth->rt_iif     =
1894         rth->fl.iif     = dev->ifindex;
1895         rth->dst.dev    = init_net.loopback_dev;
1896         dev_hold(rth->dst.dev);
1897         rth->idev       = in_dev_get(rth->dst.dev);
1898         rth->fl.oif     = 0;
1899         rth->rt_gateway = daddr;
1900         rth->rt_spec_dst= spec_dst;
1901         rth->rt_genid   = rt_genid(dev_net(dev));
1902         rth->rt_flags   = RTCF_MULTICAST;
1903         rth->rt_type    = RTN_MULTICAST;
1904         if (our) {
1905                 rth->dst.input= ip_local_deliver;
1906                 rth->rt_flags |= RTCF_LOCAL;
1907         }
1908
1909 #ifdef CONFIG_IP_MROUTE
1910         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1911                 rth->dst.input = ip_mr_input;
1912 #endif
1913         RT_CACHE_STAT_INC(in_slow_mc);
1914
1915         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1916         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1917
1918 e_nobufs:
1919         return -ENOBUFS;
1920 e_inval:
1921         return -EINVAL;
1922 e_err:
1923         return err;
1924 }
1925
1926
1927 static void ip_handle_martian_source(struct net_device *dev,
1928                                      struct in_device *in_dev,
1929                                      struct sk_buff *skb,
1930                                      __be32 daddr,
1931                                      __be32 saddr)
1932 {
1933         RT_CACHE_STAT_INC(in_martian_src);
1934 #ifdef CONFIG_IP_ROUTE_VERBOSE
1935         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1936                 /*
1937                  *      RFC1812 recommendation: if the source is martian,
1938                  *      the only hint is the MAC header.
1939                  */
1940                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1941                         &daddr, &saddr, dev->name);
1942                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1943                         int i;
1944                         const unsigned char *p = skb_mac_header(skb);
1945                         printk(KERN_WARNING "ll header: ");
1946                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1947                                 printk("%02x", *p);
1948                                 if (i < (dev->hard_header_len - 1))
1949                                         printk(":");
1950                         }
1951                         printk("\n");
1952                 }
1953         }
1954 #endif
1955 }
1956
1957 /* called in rcu_read_lock() section */
1958 static int __mkroute_input(struct sk_buff *skb,
1959                            struct fib_result *res,
1960                            struct in_device *in_dev,
1961                            __be32 daddr, __be32 saddr, u32 tos,
1962                            struct rtable **result)
1963 {
1964         struct rtable *rth;
1965         int err;
1966         struct in_device *out_dev;
1967         unsigned int flags = 0;
1968         __be32 spec_dst;
1969         u32 itag;
1970
1971         /* get a working reference to the output device */
1972         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1973         if (out_dev == NULL) {
1974                 if (net_ratelimit())
1975                         printk(KERN_CRIT "Bug in ip_route_input" \
1976                                "_slow(). Please, report\n");
1977                 return -EINVAL;
1978         }
1979
1980
1981         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1982                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1983         if (err < 0) {
1984                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1985                                          saddr);
1986
1987                 goto cleanup;
1988         }
1989
1990         if (err)
1991                 flags |= RTCF_DIRECTSRC;
1992
1993         if (out_dev == in_dev && err &&
1994             (IN_DEV_SHARED_MEDIA(out_dev) ||
1995              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1996                 flags |= RTCF_DOREDIRECT;
1997
1998         if (skb->protocol != htons(ETH_P_IP)) {
1999                 /* Not IP (i.e. ARP). Do not create a route if it is
2000                  * invalid for proxy arp. DNAT routes are always valid.
2001                  *
2002                  * The proxy arp feature has been extended to allow ARP
2003                  * replies back out the same interface, to support
2004                  * Private VLAN switch technologies. See arp.c.
2005                  */
2006                 if (out_dev == in_dev &&
2007                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2008                         err = -EINVAL;
2009                         goto cleanup;
2010                 }
2011         }
2012
2013
2014         rth = dst_alloc(&ipv4_dst_ops);
2015         if (!rth) {
2016                 err = -ENOBUFS;
2017                 goto cleanup;
2018         }
2019
2020         atomic_set(&rth->dst.__refcnt, 1);
2021         rth->dst.flags= DST_HOST;
2022         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2023                 rth->dst.flags |= DST_NOPOLICY;
2024         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2025                 rth->dst.flags |= DST_NOXFRM;
2026         rth->fl.fl4_dst = daddr;
2027         rth->rt_dst     = daddr;
2028         rth->fl.fl4_tos = tos;
2029         rth->fl.mark    = skb->mark;
2030         rth->fl.fl4_src = saddr;
2031         rth->rt_src     = saddr;
2032         rth->rt_gateway = daddr;
2033         rth->rt_iif     =
2034                 rth->fl.iif     = in_dev->dev->ifindex;
2035         rth->dst.dev    = (out_dev)->dev;
2036         dev_hold(rth->dst.dev);
2037         rth->idev       = in_dev_get(rth->dst.dev);
2038         rth->fl.oif     = 0;
2039         rth->rt_spec_dst= spec_dst;
2040
2041         rth->dst.obsolete = -1;
2042         rth->dst.input = ip_forward;
2043         rth->dst.output = ip_output;
2044         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2045
2046         rt_set_nexthop(rth, res, itag);
2047
2048         rth->rt_flags = flags;
2049
2050         *result = rth;
2051         err = 0;
2052  cleanup:
2053         return err;
2054 }
2055
2056 static int ip_mkroute_input(struct sk_buff *skb,
2057                             struct fib_result *res,
2058                             const struct flowi *fl,
2059                             struct in_device *in_dev,
2060                             __be32 daddr, __be32 saddr, u32 tos)
2061 {
2062         struct rtable* rth = NULL;
2063         int err;
2064         unsigned hash;
2065
2066 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2067         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2068                 fib_select_multipath(fl, res);
2069 #endif
2070
2071         /* create a routing cache entry */
2072         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2073         if (err)
2074                 return err;
2075
2076         /* put it into the cache */
2077         hash = rt_hash(daddr, saddr, fl->iif,
2078                        rt_genid(dev_net(rth->dst.dev)));
2079         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2080 }
2081
2082 /*
2083  *      NOTE. We drop all packets that have local source
2084  *      addresses, because every properly looped-back packet
2085  *      must already have the correct destination attached by the output routine.
2086  *
2087  *      This approach solves two big problems:
2088  *      1. Non-simplex devices are handled properly.
2089  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2090  *      called with rcu_read_lock()
2091  */
2092
2093 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094                                u8 tos, struct net_device *dev)
2095 {
2096         struct fib_result res;
2097         struct in_device *in_dev = __in_dev_get_rcu(dev);
2098         struct flowi fl = { .nl_u = { .ip4_u =
2099                                       { .daddr = daddr,
2100                                         .saddr = saddr,
2101                                         .tos = tos,
2102                                         .scope = RT_SCOPE_UNIVERSE,
2103                                       } },
2104                             .mark = skb->mark,
2105                             .iif = dev->ifindex };
2106         unsigned        flags = 0;
2107         u32             itag = 0;
2108         struct rtable * rth;
2109         unsigned        hash;
2110         __be32          spec_dst;
2111         int             err = -EINVAL;
2112         struct net    * net = dev_net(dev);
2113
2114         /* IP on this device is disabled. */
2115
2116         if (!in_dev)
2117                 goto out;
2118
2119         /* Check for the most unusual martians, which may not be detected
2120            by fib_lookup.
2121          */
2122
2123         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2124             ipv4_is_loopback(saddr))
2125                 goto martian_source;
2126
2127         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2128                 goto brd_input;
2129
2130         /* Accept zero addresses only for limited broadcast;
2131          * I do not even know whether to fix this or not. Waiting for complaints :-)
2132          */
2133         if (ipv4_is_zeronet(saddr))
2134                 goto martian_source;
2135
2136         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2137             ipv4_is_loopback(daddr))
2138                 goto martian_destination;
2139
2140         /*
2141          *      Now we are ready to route the packet.
2142          */
2143         err = fib_lookup(net, &fl, &res);
2144         if (err != 0) {
2145                 if (!IN_DEV_FORWARD(in_dev))
2146                         goto e_hostunreach;
2147                 goto no_route;
2148         }
2149
2150         RT_CACHE_STAT_INC(in_slow_tot);
2151
2152         if (res.type == RTN_BROADCAST)
2153                 goto brd_input;
2154
2155         if (res.type == RTN_LOCAL) {
2156                 err = fib_validate_source(saddr, daddr, tos,
2157                                           net->loopback_dev->ifindex,
2158                                           dev, &spec_dst, &itag, skb->mark);
2159                 if (err < 0)
2160                         goto martian_source_keep_err;
2161                 if (err)
2162                         flags |= RTCF_DIRECTSRC;
2163                 spec_dst = daddr;
2164                 goto local_input;
2165         }
2166
2167         if (!IN_DEV_FORWARD(in_dev))
2168                 goto e_hostunreach;
2169         if (res.type != RTN_UNICAST)
2170                 goto martian_destination;
2171
2172         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2173 out:    return err;
2174
2175 brd_input:
2176         if (skb->protocol != htons(ETH_P_IP))
2177                 goto e_inval;
2178
2179         if (ipv4_is_zeronet(saddr))
2180                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2181         else {
2182                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2183                                           &itag, skb->mark);
2184                 if (err < 0)
2185                         goto martian_source_keep_err;
2186                 if (err)
2187                         flags |= RTCF_DIRECTSRC;
2188         }
2189         flags |= RTCF_BROADCAST;
2190         res.type = RTN_BROADCAST;
2191         RT_CACHE_STAT_INC(in_brd);
2192
2193 local_input:
2194         rth = dst_alloc(&ipv4_dst_ops);
2195         if (!rth)
2196                 goto e_nobufs;
2197
2198         rth->dst.output= ip_rt_bug;
2199         rth->dst.obsolete = -1;
2200         rth->rt_genid = rt_genid(net);
2201
2202         atomic_set(&rth->dst.__refcnt, 1);
2203         rth->dst.flags= DST_HOST;
2204         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2205                 rth->dst.flags |= DST_NOPOLICY;
2206         rth->fl.fl4_dst = daddr;
2207         rth->rt_dst     = daddr;
2208         rth->fl.fl4_tos = tos;
2209         rth->fl.mark    = skb->mark;
2210         rth->fl.fl4_src = saddr;
2211         rth->rt_src     = saddr;
2212 #ifdef CONFIG_NET_CLS_ROUTE
2213         rth->dst.tclassid = itag;
2214 #endif
2215         rth->rt_iif     =
2216         rth->fl.iif     = dev->ifindex;
2217         rth->dst.dev    = net->loopback_dev;
2218         dev_hold(rth->dst.dev);
2219         rth->idev       = in_dev_get(rth->dst.dev);
2220         rth->rt_gateway = daddr;
2221         rth->rt_spec_dst= spec_dst;
2222         rth->dst.input= ip_local_deliver;
2223         rth->rt_flags   = flags|RTCF_LOCAL;
2224         if (res.type == RTN_UNREACHABLE) {
2225                 rth->dst.input= ip_error;
2226                 rth->dst.error= -err;
2227                 rth->rt_flags   &= ~RTCF_LOCAL;
2228         }
2229         rth->rt_type    = res.type;
2230         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2231         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2232         goto out;
2233
2234 no_route:
2235         RT_CACHE_STAT_INC(in_no_route);
2236         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2237         res.type = RTN_UNREACHABLE;
2238         if (err == -ESRCH)
2239                 err = -ENETUNREACH;
2240         goto local_input;
2241
2242         /*
2243          *      Do not cache martian addresses: they should be logged (RFC1812)
2244          */
2245 martian_destination:
2246         RT_CACHE_STAT_INC(in_martian_dst);
2247 #ifdef CONFIG_IP_ROUTE_VERBOSE
2248         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2249                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2250                         &daddr, &saddr, dev->name);
2251 #endif
2252
2253 e_hostunreach:
2254         err = -EHOSTUNREACH;
2255         goto out;
2256
2257 e_inval:
2258         err = -EINVAL;
2259         goto out;
2260
2261 e_nobufs:
2262         err = -ENOBUFS;
2263         goto out;
2264
2265 martian_source:
2266         err = -EINVAL;
2267 martian_source_keep_err:
2268         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2269         goto out;
2270 }
2271
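/*
 * Common entry point for input route resolution: try the route cache
 * first, handle multicast specially (see the comment at skip_cache below)
 * and fall back to ip_route_input_slow() on a miss.  With noref the skb
 * only borrows the cached dst instead of taking a reference on it.
 */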
2272 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2273                            u8 tos, struct net_device *dev, bool noref)
2274 {
2275         struct rtable * rth;
2276         unsigned        hash;
2277         int iif = dev->ifindex;
2278         struct net *net;
2279         int res;
2280
2281         net = dev_net(dev);
2282
2283         rcu_read_lock();
2284
2285         if (!rt_caching(net))
2286                 goto skip_cache;
2287
2288         tos &= IPTOS_RT_MASK;
2289         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2290
2291         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2292              rth = rcu_dereference(rth->dst.rt_next)) {
2293                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2294                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2295                      (rth->fl.iif ^ iif) |
2296                      rth->fl.oif |
2297                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2298                     rth->fl.mark == skb->mark &&
2299                     net_eq(dev_net(rth->dst.dev), net) &&
2300                     !rt_is_expired(rth)) {
2301                         if (noref) {
2302                                 dst_use_noref(&rth->dst, jiffies);
2303                                 skb_dst_set_noref(skb, &rth->dst);
2304                         } else {
2305                                 dst_use(&rth->dst, jiffies);
2306                                 skb_dst_set(skb, &rth->dst);
2307                         }
2308                         RT_CACHE_STAT_INC(in_hit);
2309                         rcu_read_unlock();
2310                         return 0;
2311                 }
2312                 RT_CACHE_STAT_INC(in_hlist_search);
2313         }
2314
2315 skip_cache:
2316         /* Multicast recognition logic was moved from the route cache to here.
2317            The problem was that too many Ethernet cards have broken/missing
2318            hardware multicast filters :-( As a result, a host on a multicast
2319            network acquires a lot of useless route cache entries, e.g. from
2320            SDR messages from all over the world. Now we try to get rid of them.
2321            Really, provided the software IP multicast filter is organized
2322            reasonably (at least, hashed), it does not result in a slowdown
2323            compared with route cache reject entries.
2324            Note that multicast routers are not affected, because a
2325            route cache entry is created eventually.
2326          */
2327         if (ipv4_is_multicast(daddr)) {
2328                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2329
2330                 if (in_dev) {
2331                         int our = ip_check_mc(in_dev, daddr, saddr,
2332                                               ip_hdr(skb)->protocol);
2333                         if (our
2334 #ifdef CONFIG_IP_MROUTE
2335                                 ||
2336                             (!ipv4_is_local_multicast(daddr) &&
2337                              IN_DEV_MFORWARD(in_dev))
2338 #endif
2339                            ) {
2340                                 int res = ip_route_input_mc(skb, daddr, saddr,
2341                                                             tos, dev, our);
2342                                 rcu_read_unlock();
2343                                 return res;
2344                         }
2345                 }
2346                 rcu_read_unlock();
2347                 return -EINVAL;
2348         }
2349         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2350         rcu_read_unlock();
2351         return res;
2352 }
2353 EXPORT_SYMBOL(ip_route_input_common);
2354
2355 /* called with rcu_read_lock() */
2356 static int __mkroute_output(struct rtable **result,
2357                             struct fib_result *res,
2358                             const struct flowi *fl,
2359                             const struct flowi *oldflp,
2360                             struct net_device *dev_out,
2361                             unsigned flags)
2362 {
2363         struct rtable *rth;
2364         struct in_device *in_dev;
2365         u32 tos = RT_FL_TOS(oldflp);
2366
2367         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2368                 return -EINVAL;
2369
2370         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2371                 res->type = RTN_BROADCAST;
2372         else if (ipv4_is_multicast(fl->fl4_dst))
2373                 res->type = RTN_MULTICAST;
2374         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2375                 return -EINVAL;
2376
2377         if (dev_out->flags & IFF_LOOPBACK)
2378                 flags |= RTCF_LOCAL;
2379
2380         in_dev = __in_dev_get_rcu(dev_out);
2381         if (!in_dev)
2382                 return -EINVAL;
2383
2384         if (res->type == RTN_BROADCAST) {
2385                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2386                 res->fi = NULL;
2387         } else if (res->type == RTN_MULTICAST) {
2388                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2389                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2390                                  oldflp->proto))
2391                         flags &= ~RTCF_LOCAL;
2392                 /* If the multicast route does not exist, use the
2393                  * default one, but do not gateway in this case.
2394                  * Yes, it is a hack.
2395                  */
2396                 if (res->fi && res->prefixlen < 4)
2397                         res->fi = NULL;
2398         }
2399
2400
2401         rth = dst_alloc(&ipv4_dst_ops);
2402         if (!rth)
2403                 return -ENOBUFS;
2404
2405         in_dev_hold(in_dev);
2406         rth->idev = in_dev;
2407
2408         atomic_set(&rth->dst.__refcnt, 1);
2409         rth->dst.flags= DST_HOST;
2410         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2411                 rth->dst.flags |= DST_NOXFRM;
2412         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2413                 rth->dst.flags |= DST_NOPOLICY;
2414
2415         rth->fl.fl4_dst = oldflp->fl4_dst;
2416         rth->fl.fl4_tos = tos;
2417         rth->fl.fl4_src = oldflp->fl4_src;
2418         rth->fl.oif     = oldflp->oif;
2419         rth->fl.mark    = oldflp->mark;
2420         rth->rt_dst     = fl->fl4_dst;
2421         rth->rt_src     = fl->fl4_src;
2422         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2423         /* get references to the devices that are to be held by the routing
2424            cache entry */
2425         rth->dst.dev    = dev_out;
2426         dev_hold(dev_out);
2427         rth->rt_gateway = fl->fl4_dst;
2428         rth->rt_spec_dst= fl->fl4_src;
2429
2430         rth->dst.output=ip_output;
2431         rth->dst.obsolete = -1;
2432         rth->rt_genid = rt_genid(dev_net(dev_out));
2433
2434         RT_CACHE_STAT_INC(out_slow_tot);
2435
2436         if (flags & RTCF_LOCAL) {
2437                 rth->dst.input = ip_local_deliver;
2438                 rth->rt_spec_dst = fl->fl4_dst;
2439         }
2440         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2441                 rth->rt_spec_dst = fl->fl4_src;
2442                 if (flags & RTCF_LOCAL &&
2443                     !(dev_out->flags & IFF_LOOPBACK)) {
2444                         rth->dst.output = ip_mc_output;
2445                         RT_CACHE_STAT_INC(out_slow_mc);
2446                 }
2447 #ifdef CONFIG_IP_MROUTE
2448                 if (res->type == RTN_MULTICAST) {
2449                         if (IN_DEV_MFORWARD(in_dev) &&
2450                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2451                                 rth->dst.input = ip_mr_input;
2452                                 rth->dst.output = ip_mc_output;
2453                         }
2454                 }
2455 #endif
2456         }
2457
2458         rt_set_nexthop(rth, res, 0);
2459
2460         rth->rt_flags = flags;
2461         *result = rth;
2462         return 0;
2463 }
2464
2465 /* called with rcu_read_lock() */
2466 static int ip_mkroute_output(struct rtable **rp,
2467                              struct fib_result *res,
2468                              const struct flowi *fl,
2469                              const struct flowi *oldflp,
2470                              struct net_device *dev_out,
2471                              unsigned flags)
2472 {
2473         struct rtable *rth = NULL;
2474         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2475         unsigned hash;
2476         if (err == 0) {
2477                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2478                                rt_genid(dev_net(dev_out)));
2479                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2480         }
2481
2482         return err;
2483 }
2484
2485 /*
2486  * Major route resolver routine.
2487  * called with rcu_read_lock();
2488  */
2489
2490 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2491                                 const struct flowi *oldflp)
2492 {
2493         u32 tos = RT_FL_TOS(oldflp);
2494         struct flowi fl = { .nl_u = { .ip4_u =
2495                                       { .daddr = oldflp->fl4_dst,
2496                                         .saddr = oldflp->fl4_src,
2497                                         .tos = tos & IPTOS_RT_MASK,
2498                                         .scope = ((tos & RTO_ONLINK) ?
2499                                                   RT_SCOPE_LINK :
2500                                                   RT_SCOPE_UNIVERSE),
2501                                       } },
2502                             .mark = oldflp->mark,
2503                             .iif = net->loopback_dev->ifindex,
2504                             .oif = oldflp->oif };
2505         struct fib_result res;
2506         unsigned int flags = 0;
2507         struct net_device *dev_out = NULL;
2508         int err;
2509
2510
2511         res.fi          = NULL;
2512 #ifdef CONFIG_IP_MULTIPLE_TABLES
2513         res.r           = NULL;
2514 #endif
2515
2516         if (oldflp->fl4_src) {
2517                 err = -EINVAL;
2518                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2519                     ipv4_is_lbcast(oldflp->fl4_src) ||
2520                     ipv4_is_zeronet(oldflp->fl4_src))
2521                         goto out;
2522
2523                 /* I removed the check for oif == dev_out->oif here.
2524                    It was wrong for two reasons:
2525                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2526                       is assigned to multiple interfaces.
2527                    2. Moreover, we are allowed to send packets with saddr
2528                       of another iface. --ANK
2529                  */
2530
2531                 if (oldflp->oif == 0 &&
2532                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2533                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2534                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2535                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2536                         if (dev_out == NULL)
2537                                 goto out;
2538
2539                         /* Special hack: the user can direct multicasts
2540                            and limited broadcast via the necessary interface
2541                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2542                            This hack is not just for fun, it allows
2543                            vic, vat and friends to work.
2544                            They bind a socket to loopback, set ttl to zero
2545                            and expect that to work.
2546                            From the viewpoint of the routing cache they are broken,
2547                            because we are not allowed to build a multicast path
2548                            with a loopback source address (the routing cache
2549                            cannot know that ttl is zero, so the packet
2550                            will not leave this host and the route is valid).
2551                            Luckily, this hack is a good workaround.
2552                          */
2553
2554                         fl.oif = dev_out->ifindex;
2555                         goto make_route;
2556                 }
2557
2558                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2559                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2560                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2561                                 goto out;
2562                 }
2563         }
2564
2565
2566         if (oldflp->oif) {
2567                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2568                 err = -ENODEV;
2569                 if (dev_out == NULL)
2570                         goto out;
2571
2572                 /* RACE: Check return value of inet_select_addr instead. */
2573                 if (rcu_dereference(dev_out->ip_ptr) == NULL)
2574                         goto out;       /* Wrong error code */
2575
2576                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2577                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2578                         if (!fl.fl4_src)
2579                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2580                                                               RT_SCOPE_LINK);
2581                         goto make_route;
2582                 }
2583                 if (!fl.fl4_src) {
2584                         if (ipv4_is_multicast(oldflp->fl4_dst))
2585                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2586                                                               fl.fl4_scope);
2587                         else if (!oldflp->fl4_dst)
2588                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2589                                                               RT_SCOPE_HOST);
2590                 }
2591         }
2592
2593         if (!fl.fl4_dst) {
2594                 fl.fl4_dst = fl.fl4_src;
2595                 if (!fl.fl4_dst)
2596                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2597                 dev_out = net->loopback_dev;
2598                 fl.oif = net->loopback_dev->ifindex;
2599                 res.type = RTN_LOCAL;
2600                 flags |= RTCF_LOCAL;
2601                 goto make_route;
2602         }
2603
2604         if (fib_lookup(net, &fl, &res)) {
2605                 res.fi = NULL;
2606                 if (oldflp->oif) {
2607                         /* Apparently, the routing tables are wrong. Assume
2608                            that the destination is on-link.
2609
2610                            WHY? DW.
2611                            Because we are allowed to send to an iface
2612                            even if it has NO routes and NO assigned
2613                            addresses. When oif is specified, the routing
2614                            tables are looked up with only one purpose:
2615                            to catch whether the destination is gatewayed, rather than
2616                            direct. Moreover, if MSG_DONTROUTE is set,
2617                            we send the packet, ignoring both routing tables
2618                            and ifaddr state. --ANK
2619
2620
2621                            We could do this even if oif is unknown,
2622                            as IPv6 likely does, but we do not.
2623                          */
2624
2625                         if (fl.fl4_src == 0)
2626                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2627                                                               RT_SCOPE_LINK);
2628                         res.type = RTN_UNICAST;
2629                         goto make_route;
2630                 }
2631                 err = -ENETUNREACH;
2632                 goto out;
2633         }
2634
2635         if (res.type == RTN_LOCAL) {
2636                 if (!fl.fl4_src)
2637                         fl.fl4_src = fl.fl4_dst;
2638                 dev_out = net->loopback_dev;
2639                 fl.oif = dev_out->ifindex;
2640                 res.fi = NULL;
2641                 flags |= RTCF_LOCAL;
2642                 goto make_route;
2643         }
2644
2645 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2646         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2647                 fib_select_multipath(&fl, &res);
2648         else
2649 #endif
2650         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2651                 fib_select_default(net, &fl, &res);
2652
2653         if (!fl.fl4_src)
2654                 fl.fl4_src = FIB_RES_PREFSRC(res);
2655
2656         dev_out = FIB_RES_DEV(res);
2657         fl.oif = dev_out->ifindex;
2658
2659
2660 make_route:
2661         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2662
2663 out:    return err;
2664 }
2665
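/*
 * Output route lookup without xfrm policy checks: search the route cache
 * under rcu_read_lock_bh() and fall back to ip_route_output_slow() on a
 * miss or when caching is disabled.
 */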
2666 int __ip_route_output_key(struct net *net, struct rtable **rp,
2667                           const struct flowi *flp)
2668 {
2669         unsigned int hash;
2670         int res;
2671         struct rtable *rth;
2672
2673         if (!rt_caching(net))
2674                 goto slow_output;
2675
2676         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2677
2678         rcu_read_lock_bh();
2679         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2680                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2681                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2682                     rth->fl.fl4_src == flp->fl4_src &&
2683                     rth->fl.iif == 0 &&
2684                     rth->fl.oif == flp->oif &&
2685                     rth->fl.mark == flp->mark &&
2686                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2687                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2688                     net_eq(dev_net(rth->dst.dev), net) &&
2689                     !rt_is_expired(rth)) {
2690                         dst_use(&rth->dst, jiffies);
2691                         RT_CACHE_STAT_INC(out_hit);
2692                         rcu_read_unlock_bh();
2693                         *rp = rth;
2694                         return 0;
2695                 }
2696                 RT_CACHE_STAT_INC(out_hlist_search);
2697         }
2698         rcu_read_unlock_bh();
2699
2700 slow_output:
2701         rcu_read_lock();
2702         res = ip_route_output_slow(net, rp, flp);
2703         rcu_read_unlock();
2704         return res;
2705 }
2706 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2707
2708 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2709 {
2710         return NULL;
2711 }
2712
2713 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2714 {
2715 }
2716
2717 static struct dst_ops ipv4_dst_blackhole_ops = {
2718         .family                 =       AF_INET,
2719         .protocol               =       cpu_to_be16(ETH_P_IP),
2720         .destroy                =       ipv4_dst_destroy,
2721         .check                  =       ipv4_blackhole_dst_check,
2722         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2723 };
2724
2725
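/*
 * Replace *rp with a blackhole copy of the route that silently discards
 * all traffic; used when the xfrm lookup below returns -EREMOTE.
 */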
2726 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2727 {
2728         struct rtable *ort = *rp;
2729         struct rtable *rt = (struct rtable *)
2730                 dst_alloc(&ipv4_dst_blackhole_ops);
2731
2732         if (rt) {
2733                 struct dst_entry *new = &rt->dst;
2734
2735                 atomic_set(&new->__refcnt, 1);
2736                 new->__use = 1;
2737                 new->input = dst_discard;
2738                 new->output = dst_discard;
2739                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2740
2741                 new->dev = ort->dst.dev;
2742                 if (new->dev)
2743                         dev_hold(new->dev);
2744
2745                 rt->fl = ort->fl;
2746
2747                 rt->idev = ort->idev;
2748                 if (rt->idev)
2749                         in_dev_hold(rt->idev);
2750                 rt->rt_genid = rt_genid(net);
2751                 rt->rt_flags = ort->rt_flags;
2752                 rt->rt_type = ort->rt_type;
2753                 rt->rt_dst = ort->rt_dst;
2754                 rt->rt_src = ort->rt_src;
2755                 rt->rt_iif = ort->rt_iif;
2756                 rt->rt_gateway = ort->rt_gateway;
2757                 rt->rt_spec_dst = ort->rt_spec_dst;
2758                 rt->peer = ort->peer;
2759                 if (rt->peer)
2760                         atomic_inc(&rt->peer->refcnt);
2761
2762                 dst_free(new);
2763         }
2764
2765         dst_release(&(*rp)->dst);
2766         *rp = rt;
2767         return rt ? 0 : -ENOMEM;
2768 }
2769
2770 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2771                          struct sock *sk, int flags)
2772 {
2773         int err;
2774
2775         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2776                 return err;
2777
2778         if (flp->proto) {
2779                 if (!flp->fl4_src)
2780                         flp->fl4_src = (*rp)->rt_src;
2781                 if (!flp->fl4_dst)
2782                         flp->fl4_dst = (*rp)->rt_dst;
2783                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2784                                     flags ? XFRM_LOOKUP_WAIT : 0);
2785                 if (err == -EREMOTE)
2786                         err = ipv4_dst_blackhole(net, rp, flp);
2787
2788                 return err;
2789         }
2790
2791         return 0;
2792 }
2793 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2794
2795 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2796 {
2797         return ip_route_output_flow(net, rp, flp, NULL, 0);
2798 }
2799 EXPORT_SYMBOL(ip_route_output_key);
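/*
 * Minimal usage sketch (illustrative only; the local variable names are
 * assumptions, not taken from this file): resolve an output route for a
 * destination and release it when done.
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl) == 0) {
 *		... transmit via rt->dst ...
 *		ip_rt_put(rt);
 *	}
 */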
2800
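/*
 * Encode a cached route as a netlink route message (dst, src, oif,
 * gateway, metrics, cache info, ...); inet_rtm_getroute() below uses it
 * to answer RTM_GETROUTE requests.
 */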
2801 static int rt_fill_info(struct net *net,
2802                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2803                         int nowait, unsigned int flags)
2804 {
2805         struct rtable *rt = skb_rtable(skb);
2806         struct rtmsg *r;
2807         struct nlmsghdr *nlh;
2808         long expires;
2809         u32 id = 0, ts = 0, tsage = 0, error;
2810
2811         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2812         if (nlh == NULL)
2813                 return -EMSGSIZE;
2814
2815         r = nlmsg_data(nlh);
2816         r->rtm_family    = AF_INET;
2817         r->rtm_dst_len  = 32;
2818         r->rtm_src_len  = 0;
2819         r->rtm_tos      = rt->fl.fl4_tos;
2820         r->rtm_table    = RT_TABLE_MAIN;
2821         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2822         r->rtm_type     = rt->rt_type;
2823         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2824         r->rtm_protocol = RTPROT_UNSPEC;
2825         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2826         if (rt->rt_flags & RTCF_NOTIFY)
2827                 r->rtm_flags |= RTM_F_NOTIFY;
2828
2829         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2830
2831         if (rt->fl.fl4_src) {
2832                 r->rtm_src_len = 32;
2833                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2834         }
2835         if (rt->dst.dev)
2836                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2837 #ifdef CONFIG_NET_CLS_ROUTE
2838         if (rt->dst.tclassid)
2839                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2840 #endif
2841         if (rt->fl.iif)
2842                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2843         else if (rt->rt_src != rt->fl.fl4_src)
2844                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2845
2846         if (rt->rt_dst != rt->rt_gateway)
2847                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2848
2849         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2850                 goto nla_put_failure;
2851
2852         if (rt->fl.mark)
2853                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2854
2855         error = rt->dst.error;
2856         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2857         if (rt->peer) {
2858                 inet_peer_refcheck(rt->peer);
2859                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2860                 if (rt->peer->tcp_ts_stamp) {
2861                         ts = rt->peer->tcp_ts;
2862                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2863                 }
2864         }
2865
2866         if (rt->fl.iif) {
2867 #ifdef CONFIG_IP_MROUTE
2868                 __be32 dst = rt->rt_dst;
2869
2870                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2871                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2872                         int err = ipmr_get_route(net, skb, r, nowait);
2873                         if (err <= 0) {
2874                                 if (!nowait) {
2875                                         if (err == 0)
2876                                                 return 0;
2877                                         goto nla_put_failure;
2878                                 } else {
2879                                         if (err == -EMSGSIZE)
2880                                                 goto nla_put_failure;
2881                                         error = err;
2882                                 }
2883                         }
2884                 } else
2885 #endif
2886                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2887         }
2888
2889         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2890                                expires, error) < 0)
2891                 goto nla_put_failure;
2892
2893         return nlmsg_end(skb, nlh);
2894
2895 nla_put_failure:
2896         nlmsg_cancel(skb, nlh);
2897         return -EMSGSIZE;
2898 }
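As a rough illustration of what rt_fill_info() emits, the following hypothetical user-space sketch (not part of the kernel; the function name is invented) walks one RTM_NEWROUTE payload with the standard rtnetlink macros and prints a few of the attributes filled in above:

/*
 * Hypothetical user-space sketch: walk one RTM_NEWROUTE message of the
 * kind rt_fill_info() produces.  Only a handful of attributes are shown.
 */
#include <stdio.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static void example_print_route(struct nlmsghdr *nlh)
{
	struct rtmsg *r = NLMSG_DATA(nlh);
	struct rtattr *rta = RTM_RTA(r);
	int len = RTM_PAYLOAD(nlh);

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		switch (rta->rta_type) {
		case RTA_DST:		/* rt_dst, always present */
		case RTA_GATEWAY:	/* only when it differs from rt_dst */
		case RTA_PREFSRC:	/* rt_spec_dst or rt_src */
			printf("attr %d: %08x\n", rta->rta_type,
			       *(unsigned int *)RTA_DATA(rta));
			break;
		case RTA_OIF:
			printf("oif %d\n", *(int *)RTA_DATA(rta));
			break;
		default:
			break;
		}
	}
}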
2899
2900 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2901 {
2902         struct net *net = sock_net(in_skb->sk);
2903         struct rtmsg *rtm;
2904         struct nlattr *tb[RTA_MAX+1];
2905         struct rtable *rt = NULL;
2906         __be32 dst = 0;
2907         __be32 src = 0;
2908         u32 iif;
2909         int err;
2910         int mark;
2911         struct sk_buff *skb;
2912
2913         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2914         if (err < 0)
2915                 goto errout;
2916
2917         rtm = nlmsg_data(nlh);
2918
2919         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2920         if (skb == NULL) {
2921                 err = -ENOBUFS;
2922                 goto errout;
2923         }
2924
2925         /* Reserve room for dummy headers; this skb can pass
2926            through a good chunk of the routing engine.
2927          */
2928         skb_reset_mac_header(skb);
2929         skb_reset_network_header(skb);
2930
2931         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2932         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2933         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2934
2935         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2936         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2937         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2938         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2939
2940         if (iif) {
2941                 struct net_device *dev;
2942
2943                 dev = __dev_get_by_index(net, iif);
2944                 if (dev == NULL) {
2945                         err = -ENODEV;
2946                         goto errout_free;
2947                 }
2948
2949                 skb->protocol   = htons(ETH_P_IP);
2950                 skb->dev        = dev;
2951                 skb->mark       = mark;
2952                 local_bh_disable();
2953                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2954                 local_bh_enable();
2955
2956                 rt = skb_rtable(skb);
2957                 if (err == 0 && rt->dst.error)
2958                         err = -rt->dst.error;
2959         } else {
2960                 struct flowi fl = {
2961                         .nl_u = {
2962                                 .ip4_u = {
2963                                         .daddr = dst,
2964                                         .saddr = src,
2965                                         .tos = rtm->rtm_tos,
2966                                 },
2967                         },
2968                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2969                         .mark = mark,
2970                 };
2971                 err = ip_route_output_key(net, &rt, &fl);
2972         }
2973
2974         if (err)
2975                 goto errout_free;
2976
2977         skb_dst_set(skb, &rt->dst);
2978         if (rtm->rtm_flags & RTM_F_NOTIFY)
2979                 rt->rt_flags |= RTCF_NOTIFY;
2980
2981         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2982                            RTM_NEWROUTE, 0, 0);
2983         if (err <= 0)
2984                 goto errout_free;
2985
2986         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2987 errout:
2988         return err;
2989
2990 errout_free:
2991         kfree_skb(skb);
2992         goto errout;
2993 }
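For context, a hedged user-space sketch (not part of the kernel; the helper name is invented) of the RTM_GETROUTE request that lands in inet_rtm_getroute(), roughly what "ip route get" sends: an rtmsg plus an RTA_DST attribute, answered by the RTM_NEWROUTE reply built in rt_fill_info():

/* Hypothetical user-space sketch: ask the kernel to resolve a route to dst. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_route_get(__be32 dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg	rtm;
		char		buf[64];	/* room for attributes */
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	int fd, ret;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;

	/* Append RTA_DST: the destination we want a route for. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len  = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	ret = sendto(fd, &req, req.nlh.nlmsg_len, 0,
		     (struct sockaddr *)&sa, sizeof(sa));
	/* The reply is a single RTM_NEWROUTE message; parse it as shown above. */
	close(fd);
	return ret < 0 ? -1 : 0;
}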
2994
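/*
 * ip_rt_dump() below walks every rt_hash_table bucket under
 * rcu_read_lock_bh() and emits one RTM_NEWROUTE message per non-expired
 * entry belonging to the requesting namespace.  The current bucket and
 * chain index are saved in cb->args[0] / cb->args[1] so the dump can be
 * resumed on the next callback once the skb fills up.
 */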
2995 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2996 {
2997         struct rtable *rt;
2998         int h, s_h;
2999         int idx, s_idx;
3000         struct net *net;
3001
3002         net = sock_net(skb->sk);
3003
3004         s_h = cb->args[0];
3005         if (s_h < 0)
3006                 s_h = 0;
3007         s_idx = idx = cb->args[1];
3008         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3009                 if (!rt_hash_table[h].chain)
3010                         continue;
3011                 rcu_read_lock_bh();
3012                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3013                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3014                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3015                                 continue;
3016                         if (rt_is_expired(rt))
3017                                 continue;
3018                         skb_dst_set_noref(skb, &rt->dst);
3019                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3020                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3021                                          1, NLM_F_MULTI) <= 0) {
3022                                 skb_dst_drop(skb);
3023                                 rcu_read_unlock_bh();
3024                                 goto done;
3025                         }
3026                         skb_dst_drop(skb);
3027                 }
3028                 rcu_read_unlock_bh();
3029         }
3030
3031 done:
3032         cb->args[0] = h;
3033         cb->args[1] = idx;
3034         return skb->len;
3035 }
3036
3037 void ip_rt_multicast_event(struct in_device *in_dev)
3038 {
3039         rt_cache_flush(dev_net(in_dev->dev), 0);
3040 }
3041
3042 #ifdef CONFIG_SYSCTL
3043 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3044                                         void __user *buffer,
3045                                         size_t *lenp, loff_t *ppos)
3046 {
3047         if (write) {
3048                 int flush_delay;
3049                 ctl_table ctl;
3050                 struct net *net;
3051
3052                 memcpy(&ctl, __ctl, sizeof(ctl));
3053                 ctl.data = &flush_delay;
3054                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3055
3056                 net = (struct net *)__ctl->extra1;
3057                 rt_cache_flush(net, flush_delay);
3058                 return 0;
3059         }
3060
3061         return -EINVAL;
3062 }
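This handler sits behind the per-namespace "flush" entry registered further down (ipv4_route_flush_table along ipv4_route_path), i.e. /proc/sys/net/ipv4/route/flush. A hedged user-space sketch of triggering a flush follows; the helper name is invented, and the written integer is interpreted as the flush delay handed to rt_cache_flush(), defined earlier in this file:

/* Hypothetical user-space sketch: write a delay value to the flush sysctl. */
#include <fcntl.h>
#include <unistd.h>

static int example_flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
	char val[] = "0\n";		/* interpreted as the flush delay */
	int ok;

	if (fd < 0)
		return -1;
	ok = write(fd, val, sizeof(val) - 1) == sizeof(val) - 1;
	close(fd);
	return ok ? 0 : -1;
}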
3063
3064 static ctl_table ipv4_route_table[] = {
3065         {
3066                 .procname       = "gc_thresh",
3067                 .data           = &ipv4_dst_ops.gc_thresh,
3068                 .maxlen         = sizeof(int),
3069                 .mode           = 0644,
3070                 .proc_handler   = proc_dointvec,
3071         },
3072         {
3073                 .procname       = "max_size",
3074                 .data           = &ip_rt_max_size,
3075                 .maxlen         = sizeof(int),
3076                 .mode           = 0644,
3077                 .proc_handler   = proc_dointvec,
3078         },
3079         {
3080                 /*  Deprecated. Use gc_min_interval_ms */
3081
3082                 .procname       = "gc_min_interval",
3083                 .data           = &ip_rt_gc_min_interval,
3084                 .maxlen         = sizeof(int),
3085                 .mode           = 0644,
3086                 .proc_handler   = proc_dointvec_jiffies,
3087         },
3088         {
3089                 .procname       = "gc_min_interval_ms",
3090                 .data           = &ip_rt_gc_min_interval,
3091                 .maxlen         = sizeof(int),
3092                 .mode           = 0644,
3093                 .proc_handler   = proc_dointvec_ms_jiffies,
3094         },
3095         {
3096                 .procname       = "gc_timeout",
3097                 .data           = &ip_rt_gc_timeout,
3098                 .maxlen         = sizeof(int),
3099                 .mode           = 0644,
3100                 .proc_handler   = proc_dointvec_jiffies,
3101         },
3102         {
3103                 .procname       = "gc_interval",
3104                 .data           = &ip_rt_gc_interval,
3105                 .maxlen         = sizeof(int),
3106                 .mode           = 0644,
3107                 .proc_handler   = proc_dointvec_jiffies,
3108         },
3109         {
3110                 .procname       = "redirect_load",
3111                 .data           = &ip_rt_redirect_load,
3112                 .maxlen         = sizeof(int),
3113                 .mode           = 0644,
3114                 .proc_handler   = proc_dointvec,
3115         },
3116         {
3117                 .procname       = "redirect_number",
3118                 .data           = &ip_rt_redirect_number,
3119                 .maxlen         = sizeof(int),
3120                 .mode           = 0644,
3121                 .proc_handler   = proc_dointvec,
3122         },
3123         {
3124                 .procname       = "redirect_silence",
3125                 .data           = &ip_rt_redirect_silence,
3126                 .maxlen         = sizeof(int),
3127                 .mode           = 0644,
3128                 .proc_handler   = proc_dointvec,
3129         },
3130         {
3131                 .procname       = "error_cost",
3132                 .data           = &ip_rt_error_cost,
3133                 .maxlen         = sizeof(int),
3134                 .mode           = 0644,
3135                 .proc_handler   = proc_dointvec,
3136         },
3137         {
3138                 .procname       = "error_burst",
3139                 .data           = &ip_rt_error_burst,
3140                 .maxlen         = sizeof(int),
3141                 .mode           = 0644,
3142                 .proc_handler   = proc_dointvec,
3143         },
3144         {
3145                 .procname       = "gc_elasticity",
3146                 .data           = &ip_rt_gc_elasticity,
3147                 .maxlen         = sizeof(int),
3148                 .mode           = 0644,
3149                 .proc_handler   = proc_dointvec,
3150         },
3151         {
3152                 .procname       = "mtu_expires",
3153                 .data           = &ip_rt_mtu_expires,
3154                 .maxlen         = sizeof(int),
3155                 .mode           = 0644,
3156                 .proc_handler   = proc_dointvec_jiffies,
3157         },
3158         {
3159                 .procname       = "min_pmtu",
3160                 .data           = &ip_rt_min_pmtu,
3161                 .maxlen         = sizeof(int),
3162                 .mode           = 0644,
3163                 .proc_handler   = proc_dointvec,
3164         },
3165         {
3166                 .procname       = "min_adv_mss",
3167                 .data           = &ip_rt_min_advmss,
3168                 .maxlen         = sizeof(int),
3169                 .mode           = 0644,
3170                 .proc_handler   = proc_dointvec,
3171         },
3172         { }
3173 };
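/*
 * The tunables above are exposed under /proc/sys/net/ipv4/route/ once
 * ip_static_sysctl_init() registers ipv4_skeleton along ipv4_path below.
 */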
3174
3175 static struct ctl_table empty[1];
3176
3177 static struct ctl_table ipv4_skeleton[] =
3178 {
3179         { .procname = "route", 
3180           .mode = 0555, .child = ipv4_route_table},
3181         { .procname = "neigh", 
3182           .mode = 0555, .child = empty},
3183         { }
3184 };
3185
3186 static __net_initdata struct ctl_path ipv4_path[] = {
3187         { .procname = "net", },
3188         { .procname = "ipv4", },
3189         { },
3190 };
3191
3192 static struct ctl_table ipv4_route_flush_table[] = {
3193         {
3194                 .procname       = "flush",
3195                 .maxlen         = sizeof(int),
3196                 .mode           = 0200,
3197                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3198         },
3199         { },
3200 };
3201
3202 static __net_initdata struct ctl_path ipv4_route_path[] = {
3203         { .procname = "net", },
3204         { .procname = "ipv4", },
3205         { .procname = "route", },
3206         { },
3207 };
3208
3209 static __net_init int sysctl_route_net_init(struct net *net)
3210 {
3211         struct ctl_table *tbl;
3212
3213         tbl = ipv4_route_flush_table;
3214         if (!net_eq(net, &init_net)) {
3215                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3216                 if (tbl == NULL)
3217                         goto err_dup;
3218         }
3219         tbl[0].extra1 = net;
3220
3221         net->ipv4.route_hdr =
3222                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3223         if (net->ipv4.route_hdr == NULL)
3224                 goto err_reg;
3225         return 0;
3226
3227 err_reg:
3228         if (tbl != ipv4_route_flush_table)
3229                 kfree(tbl);
3230 err_dup:
3231         return -ENOMEM;
3232 }
3233
3234 static __net_exit void sysctl_route_net_exit(struct net *net)
3235 {
3236         struct ctl_table *tbl;
3237
3238         tbl = net->ipv4.route_hdr->ctl_table_arg;
3239         unregister_net_sysctl_table(net->ipv4.route_hdr);
3240         BUG_ON(tbl == ipv4_route_flush_table);
3241         kfree(tbl);
3242 }
3243
3244 static __net_initdata struct pernet_operations sysctl_route_ops = {
3245         .init = sysctl_route_net_init,
3246         .exit = sysctl_route_net_exit,
3247 };
3248 #endif
3249
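/*
 * rt_genid_init() below seeds a random per-namespace generation id.  Cached
 * entries record the genid in force when they were created, and
 * rt_is_expired() (used in ip_rt_dump() above) treats any mismatch as a
 * stale entry, so bumping the genid lazily invalidates the whole cache
 * without walking it.
 */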
3250 static __net_init int rt_genid_init(struct net *net)
3251 {
3252         get_random_bytes(&net->ipv4.rt_genid,
3253                          sizeof(net->ipv4.rt_genid));
3254         return 0;
3255 }
3256
3257 static __net_initdata struct pernet_operations rt_genid_ops = {
3258         .init = rt_genid_init,
3259 };
3260
3261
3262 #ifdef CONFIG_NET_CLS_ROUTE
3263 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3264 #endif /* CONFIG_NET_CLS_ROUTE */
3265
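/*
 * "rhash_entries=" is a boot-time parameter overriding the number of route
 * cache hash buckets.  When it is left at zero, alloc_large_system_hash()
 * in ip_rt_init() below sizes the table from available memory, limited to
 * 512K buckets.
 */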
3266 static __initdata unsigned long rhash_entries;
3267 static int __init set_rhash_entries(char *str)
3268 {
3269         if (!str)
3270                 return 0;
3271         rhash_entries = simple_strtoul(str, &str, 0);
3272         return 1;
3273 }
3274 __setup("rhash_entries=", set_rhash_entries);
3275
3276 int __init ip_rt_init(void)
3277 {
3278         int rc = 0;
3279
3280 #ifdef CONFIG_NET_CLS_ROUTE
3281         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3282         if (!ip_rt_acct)
3283                 panic("IP: failed to allocate ip_rt_acct\n");
3284 #endif
3285
3286         ipv4_dst_ops.kmem_cachep =
3287                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3288                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3289
3290         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3291
3292         if (dst_entries_init(&ipv4_dst_ops) < 0)
3293                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3294
3295         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3296                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3297
3298         rt_hash_table = (struct rt_hash_bucket *)
3299                 alloc_large_system_hash("IP route cache",
3300                                         sizeof(struct rt_hash_bucket),
3301                                         rhash_entries,
3302                                         (totalram_pages >= 128 * 1024) ?
3303                                         15 : 17,
3304                                         0,
3305                                         &rt_hash_log,
3306                                         &rt_hash_mask,
3307                                         rhash_entries ? 0 : 512 * 1024);
3308         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3309         rt_hash_lock_init();
3310
3311         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3312         ip_rt_max_size = (rt_hash_mask + 1) * 16;
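	/*
	 * Sizing note: the two assignments above tie garbage collection to
	 * the table geometry: gc_thresh equals the bucket count and
	 * ip_rt_max_size allows 16 cached routes per bucket on average
	 * (e.g. 65536 buckets -> gc_thresh = 65536, ip_rt_max_size = 1048576).
	 */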
3313
3314         devinet_init();
3315         ip_fib_init();
3316
3317         /* All the timers started at system startup tend
3318            to synchronize. Perturb them a bit.
3319          */
3320         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3321         expires_ljiffies = jiffies;
3322         schedule_delayed_work(&expires_work,
3323                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3324
3325         if (ip_rt_proc_init())
3326                 printk(KERN_ERR "Unable to create route proc files\n");
3327 #ifdef CONFIG_XFRM
3328         xfrm_init();
3329         xfrm4_init(ip_rt_max_size);
3330 #endif
3331         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3332
3333 #ifdef CONFIG_SYSCTL
3334         register_pernet_subsys(&sysctl_route_ops);
3335 #endif
3336         register_pernet_subsys(&rt_genid_ops);
3337         return rc;
3338 }
3339
3340 #ifdef CONFIG_SYSCTL
3341 /*
3342  * We really need to sanitize the damn ipv4 init order, then all
3343  * this nonsense will go away.
3344  */
3345 void __init ip_static_sysctl_init(void)
3346 {
3347         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3348 }
3349 #endif