net-next: remove useless union keyword
[linux-flexiantxendom0-natty.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
/*
 * RT_FL_TOS - routing-relevant TOS bits of a flow key.
 * @oldflp: pointer expression yielding an object with an fl4_tos member.
 *
 * Masks fl4_tos with IPTOS_RT_MASK | RTO_ONLINK and yields the result as
 * a u32.  The argument is parenthesized so that any pointer expression
 * (for example "base + 1") expands correctly.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
144                                          struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150
151 static struct dst_ops ipv4_dst_ops = {
152         .family =               AF_INET,
153         .protocol =             cpu_to_be16(ETH_P_IP),
154         .gc =                   rt_garbage_collect,
155         .check =                ipv4_dst_check,
156         .destroy =              ipv4_dst_destroy,
157         .ifdown =               ipv4_dst_ifdown,
158         .negative_advice =      ipv4_negative_advice,
159         .link_failure =         ipv4_link_failure,
160         .update_pmtu =          ip_rt_update_pmtu,
161         .local_out =            __ip_local_out,
162         .entries =              ATOMIC_INIT(0),
163 };
164
165 #define ECN_OR_COST(class)      TC_PRIO_##class
166
167 const __u8 ip_tos2prio[16] = {
168         TC_PRIO_BESTEFFORT,
169         ECN_OR_COST(FILLER),
170         TC_PRIO_BESTEFFORT,
171         ECN_OR_COST(BESTEFFORT),
172         TC_PRIO_BULK,
173         ECN_OR_COST(BULK),
174         TC_PRIO_BULK,
175         ECN_OR_COST(BULK),
176         TC_PRIO_INTERACTIVE,
177         ECN_OR_COST(INTERACTIVE),
178         TC_PRIO_INTERACTIVE,
179         ECN_OR_COST(INTERACTIVE),
180         TC_PRIO_INTERACTIVE_BULK,
181         ECN_OR_COST(INTERACTIVE_BULK),
182         TC_PRIO_INTERACTIVE_BULK,
183         ECN_OR_COST(INTERACTIVE_BULK)
184 };
185
186
187 /*
188  * Route cache.
189  */
190
191 /* The locking scheme is rather straightforward:
192  *
193  * 1) Read-Copy Update protects the buckets of the central route hash.
194  * 2) Only writers remove entries, and they hold the lock
195  *    as they look at rtable reference counts.
196  * 3) Only readers acquire references to rtable entries,
197  *    they do so with atomic increments and with the
198  *    lock held.
199  */
200
/* One slot of the route cache hash table. */
struct rt_hash_bucket {
	/* First entry of this slot's chain; entries link via dst.rt_next. */
	struct rtable *chain;
};
204
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs (on lockdep we have a quite big spinlock_t, so keep the
 * size down there).
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock covering hash slot @slot.  Slots are folded onto
 * the lock table with a power-of-two mask.  The expansion is fully
 * parenthesized so it stays a single operand wherever it is used.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate the lock table and initialize every spinlock in it.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: there is no per-slot lock address. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
250
251 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
252 static unsigned                 rt_hash_mask __read_mostly;
253 static unsigned int             rt_hash_log  __read_mostly;
254
255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
257
258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
259                                    int genid)
260 {
261         return jhash_3words((__force u32)daddr, (__force u32)saddr,
262                             idx, genid)
263                 & rt_hash_mask;
264 }
265
266 static inline int rt_genid(struct net *net)
267 {
268         return atomic_read(&net->ipv4.rt_genid);
269 }
270
271 #ifdef CONFIG_PROC_FS
272 struct rt_cache_iter_state {
273         struct seq_net_private p;
274         int bucket;
275         int genid;
276 };
277
278 static struct rtable *rt_cache_get_first(struct seq_file *seq)
279 {
280         struct rt_cache_iter_state *st = seq->private;
281         struct rtable *r = NULL;
282
283         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284                 if (!rt_hash_table[st->bucket].chain)
285                         continue;
286                 rcu_read_lock_bh();
287                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
288                 while (r) {
289                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
290                             r->rt_genid == st->genid)
291                                 return r;
292                         r = rcu_dereference_bh(r->dst.rt_next);
293                 }
294                 rcu_read_unlock_bh();
295         }
296         return r;
297 }
298
299 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
300                                           struct rtable *r)
301 {
302         struct rt_cache_iter_state *st = seq->private;
303
304         r = r->dst.rt_next;
305         while (!r) {
306                 rcu_read_unlock_bh();
307                 do {
308                         if (--st->bucket < 0)
309                                 return NULL;
310                 } while (!rt_hash_table[st->bucket].chain);
311                 rcu_read_lock_bh();
312                 r = rt_hash_table[st->bucket].chain;
313         }
314         return rcu_dereference_bh(r);
315 }
316
317 static struct rtable *rt_cache_get_next(struct seq_file *seq,
318                                         struct rtable *r)
319 {
320         struct rt_cache_iter_state *st = seq->private;
321         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
322                 if (dev_net(r->dst.dev) != seq_file_net(seq))
323                         continue;
324                 if (r->rt_genid == st->genid)
325                         break;
326         }
327         return r;
328 }
329
330 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
331 {
332         struct rtable *r = rt_cache_get_first(seq);
333
334         if (r)
335                 while (pos && (r = rt_cache_get_next(seq, r)))
336                         --pos;
337         return pos ? NULL : r;
338 }
339
340 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
341 {
342         struct rt_cache_iter_state *st = seq->private;
343         if (*pos)
344                 return rt_cache_get_idx(seq, *pos - 1);
345         st->genid = rt_genid(seq_file_net(seq));
346         return SEQ_START_TOKEN;
347 }
348
349 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
350 {
351         struct rtable *r;
352
353         if (v == SEQ_START_TOKEN)
354                 r = rt_cache_get_first(seq);
355         else
356                 r = rt_cache_get_next(seq, v);
357         ++*pos;
358         return r;
359 }
360
361 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
362 {
363         if (v && v != SEQ_START_TOKEN)
364                 rcu_read_unlock_bh();
365 }
366
367 static int rt_cache_seq_show(struct seq_file *seq, void *v)
368 {
369         if (v == SEQ_START_TOKEN)
370                 seq_printf(seq, "%-127s\n",
371                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
372                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
373                            "HHUptod\tSpecDst");
374         else {
375                 struct rtable *r = v;
376                 int len;
377
378                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
379                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
380                         r->dst.dev ? r->dst.dev->name : "*",
381                         (__force u32)r->rt_dst,
382                         (__force u32)r->rt_gateway,
383                         r->rt_flags, atomic_read(&r->dst.__refcnt),
384                         r->dst.__use, 0, (__force u32)r->rt_src,
385                         (dst_metric(&r->dst, RTAX_ADVMSS) ?
386                              (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
387                         dst_metric(&r->dst, RTAX_WINDOW),
388                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
389                               dst_metric(&r->dst, RTAX_RTTVAR)),
390                         r->fl.fl4_tos,
391                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
392                         r->dst.hh ? (r->dst.hh->hh_output ==
393                                        dev_queue_xmit) : 0,
394                         r->rt_spec_dst, &len);
395
396                 seq_printf(seq, "%*s\n", 127 - len, "");
397         }
398         return 0;
399 }
400
401 static const struct seq_operations rt_cache_seq_ops = {
402         .start  = rt_cache_seq_start,
403         .next   = rt_cache_seq_next,
404         .stop   = rt_cache_seq_stop,
405         .show   = rt_cache_seq_show,
406 };
407
408 static int rt_cache_seq_open(struct inode *inode, struct file *file)
409 {
410         return seq_open_net(inode, file, &rt_cache_seq_ops,
411                         sizeof(struct rt_cache_iter_state));
412 }
413
414 static const struct file_operations rt_cache_seq_fops = {
415         .owner   = THIS_MODULE,
416         .open    = rt_cache_seq_open,
417         .read    = seq_read,
418         .llseek  = seq_lseek,
419         .release = seq_release_net,
420 };
421
422
423 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
424 {
425         int cpu;
426
427         if (*pos == 0)
428                 return SEQ_START_TOKEN;
429
430         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
431                 if (!cpu_possible(cpu))
432                         continue;
433                 *pos = cpu+1;
434                 return &per_cpu(rt_cache_stat, cpu);
435         }
436         return NULL;
437 }
438
439 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
440 {
441         int cpu;
442
443         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
444                 if (!cpu_possible(cpu))
445                         continue;
446                 *pos = cpu+1;
447                 return &per_cpu(rt_cache_stat, cpu);
448         }
449         return NULL;
450
451 }
452
/* seq_file ->stop: the per-CPU walk holds nothing that needs releasing. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
457
458 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
459 {
460         struct rt_cache_stat *st = v;
461
462         if (v == SEQ_START_TOKEN) {
463                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
464                 return 0;
465         }
466
467         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
468                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469                    atomic_read(&ipv4_dst_ops.entries),
470                    st->in_hit,
471                    st->in_slow_tot,
472                    st->in_slow_mc,
473                    st->in_no_route,
474                    st->in_brd,
475                    st->in_martian_dst,
476                    st->in_martian_src,
477
478                    st->out_hit,
479                    st->out_slow_tot,
480                    st->out_slow_mc,
481
482                    st->gc_total,
483                    st->gc_ignored,
484                    st->gc_goal_miss,
485                    st->gc_dst_overflow,
486                    st->in_hlist_search,
487                    st->out_hlist_search
488                 );
489         return 0;
490 }
491
492 static const struct seq_operations rt_cpu_seq_ops = {
493         .start  = rt_cpu_seq_start,
494         .next   = rt_cpu_seq_next,
495         .stop   = rt_cpu_seq_stop,
496         .show   = rt_cpu_seq_show,
497 };
498
499
500 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
501 {
502         return seq_open(file, &rt_cpu_seq_ops);
503 }
504
505 static const struct file_operations rt_cpu_seq_fops = {
506         .owner   = THIS_MODULE,
507         .open    = rt_cpu_seq_open,
508         .read    = seq_read,
509         .llseek  = seq_lseek,
510         .release = seq_release,
511 };
512
513 #ifdef CONFIG_NET_CLS_ROUTE
514 static int rt_acct_proc_show(struct seq_file *m, void *v)
515 {
516         struct ip_rt_acct *dst, *src;
517         unsigned int i, j;
518
519         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
520         if (!dst)
521                 return -ENOMEM;
522
523         for_each_possible_cpu(i) {
524                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
525                 for (j = 0; j < 256; j++) {
526                         dst[j].o_bytes   += src[j].o_bytes;
527                         dst[j].o_packets += src[j].o_packets;
528                         dst[j].i_bytes   += src[j].i_bytes;
529                         dst[j].i_packets += src[j].i_packets;
530                 }
531         }
532
533         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
534         kfree(dst);
535         return 0;
536 }
537
538 static int rt_acct_proc_open(struct inode *inode, struct file *file)
539 {
540         return single_open(file, rt_acct_proc_show, NULL);
541 }
542
543 static const struct file_operations rt_acct_proc_fops = {
544         .owner          = THIS_MODULE,
545         .open           = rt_acct_proc_open,
546         .read           = seq_read,
547         .llseek         = seq_lseek,
548         .release        = single_release,
549 };
550 #endif
551
552 static int __net_init ip_rt_do_proc_init(struct net *net)
553 {
554         struct proc_dir_entry *pde;
555
556         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
557                         &rt_cache_seq_fops);
558         if (!pde)
559                 goto err1;
560
561         pde = proc_create("rt_cache", S_IRUGO,
562                           net->proc_net_stat, &rt_cpu_seq_fops);
563         if (!pde)
564                 goto err2;
565
566 #ifdef CONFIG_NET_CLS_ROUTE
567         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
568         if (!pde)
569                 goto err3;
570 #endif
571         return 0;
572
573 #ifdef CONFIG_NET_CLS_ROUTE
574 err3:
575         remove_proc_entry("rt_cache", net->proc_net_stat);
576 #endif
577 err2:
578         remove_proc_entry("rt_cache", net->proc_net);
579 err1:
580         return -ENOMEM;
581 }
582
583 static void __net_exit ip_rt_do_proc_exit(struct net *net)
584 {
585         remove_proc_entry("rt_cache", net->proc_net_stat);
586         remove_proc_entry("rt_cache", net->proc_net);
587 #ifdef CONFIG_NET_CLS_ROUTE
588         remove_proc_entry("rt_acct", net->proc_net);
589 #endif
590 }
591
592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
593         .init = ip_rt_do_proc_init,
594         .exit = ip_rt_do_proc_exit,
595 };
596
597 static int __init ip_rt_proc_init(void)
598 {
599         return register_pernet_subsys(&ip_rt_proc_ops);
600 }
601
602 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
607 #endif /* CONFIG_PROC_FS */
608
609 static inline void rt_free(struct rtable *rt)
610 {
611         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
612 }
613
/*
 * Drop the caller's reference with ip_rt_put(), then queue @rt for
 * release exactly as rt_free() does.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
619
620 static inline int rt_fast_clean(struct rtable *rth)
621 {
622         /* Kill broadcast/multicast entries very aggresively, if they
623            collide in hash table with more useful entries */
624         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625                 rth->fl.iif && rth->dst.rt_next;
626 }
627
628 static inline int rt_valuable(struct rtable *rth)
629 {
630         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631                 rth->dst.expires;
632 }
633
634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635 {
636         unsigned long age;
637         int ret = 0;
638
639         if (atomic_read(&rth->dst.__refcnt))
640                 goto out;
641
642         ret = 1;
643         if (rth->dst.expires &&
644             time_after_eq(jiffies, rth->dst.expires))
645                 goto out;
646
647         age = jiffies - rth->dst.lastuse;
648         ret = 0;
649         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650             (age <= tmo2 && rt_valuable(rth)))
651                 goto out;
652         ret = 1;
653 out:    return ret;
654 }
655
656 /* Bits of score are:
657  * 31: very valuable
658  * 30: not quite useless
659  * 29..0: usage counter
660  */
661 static inline u32 rt_score(struct rtable *rt)
662 {
663         u32 score = jiffies - rt->dst.lastuse;
664
665         score = ~score & ~(3<<30);
666
667         if (rt_valuable(rt))
668                 score |= (1<<31);
669
670         if (!rt->fl.iif ||
671             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672                 score |= (1<<30);
673
674         return score;
675 }
676
677 static inline bool rt_caching(const struct net *net)
678 {
679         return net->ipv4.current_rt_cache_rebuild_count <=
680                 net->ipv4.sysctl_rt_cache_rebuild_count;
681 }
682
683 static inline bool compare_hash_inputs(const struct flowi *fl1,
684                                         const struct flowi *fl2)
685 {
686         return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
687                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
688                 (fl1->iif ^ fl2->iif)) == 0);
689 }
690
691 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
692 {
693         return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
694                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
695                 (fl1->mark ^ fl2->mark) |
696                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
697                 (fl1->oif ^ fl2->oif) |
698                 (fl1->iif ^ fl2->iif)) == 0;
699 }
700
701 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
702 {
703         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
704 }
705
706 static inline int rt_is_expired(struct rtable *rth)
707 {
708         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
709 }
710
711 /*
712  * Perform a full scan of hash table and free all entries.
713  * Can be called by a softirq or a process.
714  * In the latter case, we want to reschedule if necessary.
715  */
716 static void rt_do_flush(int process_context)
717 {
718         unsigned int i;
719         struct rtable *rth, *next;
720         struct rtable * tail;
721
722         for (i = 0; i <= rt_hash_mask; i++) {
723                 if (process_context && need_resched())
724                         cond_resched();
725                 rth = rt_hash_table[i].chain;
726                 if (!rth)
727                         continue;
728
729                 spin_lock_bh(rt_hash_lock_addr(i));
730 #ifdef CONFIG_NET_NS
731                 {
732                 struct rtable ** prev, * p;
733
734                 rth = rt_hash_table[i].chain;
735
736                 /* defer releasing the head of the list after spin_unlock */
737                 for (tail = rth; tail; tail = tail->dst.rt_next)
738                         if (!rt_is_expired(tail))
739                                 break;
740                 if (rth != tail)
741                         rt_hash_table[i].chain = tail;
742
743                 /* call rt_free on entries after the tail requiring flush */
744                 prev = &rt_hash_table[i].chain;
745                 for (p = *prev; p; p = next) {
746                         next = p->dst.rt_next;
747                         if (!rt_is_expired(p)) {
748                                 prev = &p->dst.rt_next;
749                         } else {
750                                 *prev = next;
751                                 rt_free(p);
752                         }
753                 }
754                 }
755 #else
756                 rth = rt_hash_table[i].chain;
757                 rt_hash_table[i].chain = NULL;
758                 tail = NULL;
759 #endif
760                 spin_unlock_bh(rt_hash_lock_addr(i));
761
762                 for (; rth != tail; rth = next) {
763                         next = rth->dst.rt_next;
764                         rt_free(rth);
765                 }
766         }
767 }
768
769 /*
770  * While freeing expired entries, we compute average chain length
771  * and standard deviation, using fixed-point arithmetic.
772  * This gives us an estimate of rt_chain_length_max:
773  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
774  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
775  */
776
/* Fixed-point format of the chain length statistics. */
#define FRACT_BITS	3			/* fractional bits */
#define ONE		(1UL << FRACT_BITS)	/* the value 1.0 */
779
780 /*
781  * Given a hash chain and an item in this hash chain,
782  * find if a previous entry has the same hash_inputs
783  * (but differs on tos, mark or oif)
784  * Returns 0 if an alias is found.
785  * Returns ONE if rth has no alias before itself.
786  */
787 static int has_noalias(const struct rtable *head, const struct rtable *rth)
788 {
789         const struct rtable *aux = head;
790
791         while (aux != rth) {
792                 if (compare_hash_inputs(&aux->fl, &rth->fl))
793                         return 0;
794                 aux = aux->dst.rt_next;
795         }
796         return ONE;
797 }
798
799 static void rt_check_expire(void)
800 {
801         static unsigned int rover;
802         unsigned int i = rover, goal;
803         struct rtable *rth, **rthp;
804         unsigned long samples = 0;
805         unsigned long sum = 0, sum2 = 0;
806         unsigned long delta;
807         u64 mult;
808
809         delta = jiffies - expires_ljiffies;
810         expires_ljiffies = jiffies;
811         mult = ((u64)delta) << rt_hash_log;
812         if (ip_rt_gc_timeout > 1)
813                 do_div(mult, ip_rt_gc_timeout);
814         goal = (unsigned int)mult;
815         if (goal > rt_hash_mask)
816                 goal = rt_hash_mask + 1;
817         for (; goal > 0; goal--) {
818                 unsigned long tmo = ip_rt_gc_timeout;
819                 unsigned long length;
820
821                 i = (i + 1) & rt_hash_mask;
822                 rthp = &rt_hash_table[i].chain;
823
824                 if (need_resched())
825                         cond_resched();
826
827                 samples++;
828
829                 if (*rthp == NULL)
830                         continue;
831                 length = 0;
832                 spin_lock_bh(rt_hash_lock_addr(i));
833                 while ((rth = *rthp) != NULL) {
834                         prefetch(rth->dst.rt_next);
835                         if (rt_is_expired(rth)) {
836                                 *rthp = rth->dst.rt_next;
837                                 rt_free(rth);
838                                 continue;
839                         }
840                         if (rth->dst.expires) {
841                                 /* Entry is expired even if it is in use */
842                                 if (time_before_eq(jiffies, rth->dst.expires)) {
843 nofree:
844                                         tmo >>= 1;
845                                         rthp = &rth->dst.rt_next;
846                                         /*
847                                          * We only count entries on
848                                          * a chain with equal hash inputs once
849                                          * so that entries for different QOS
850                                          * levels, and other non-hash input
851                                          * attributes don't unfairly skew
852                                          * the length computation
853                                          */
854                                         length += has_noalias(rt_hash_table[i].chain, rth);
855                                         continue;
856                                 }
857                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
858                                 goto nofree;
859
860                         /* Cleanup aged off entries. */
861                         *rthp = rth->dst.rt_next;
862                         rt_free(rth);
863                 }
864                 spin_unlock_bh(rt_hash_lock_addr(i));
865                 sum += length;
866                 sum2 += length*length;
867         }
868         if (samples) {
869                 unsigned long avg = sum / samples;
870                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
871                 rt_chain_length_max = max_t(unsigned long,
872                                         ip_rt_gc_elasticity,
873                                         (avg + 4*sd) >> FRACT_BITS);
874         }
875         rover = i;
876 }
877
878 /*
879  * rt_worker_func() is run in process context.
880  * we call rt_check_expire() to scan part of the hash table
881  */
882 static void rt_worker_func(struct work_struct *work)
883 {
884         rt_check_expire();
885         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
886 }
887
888 /*
889  * Pertubation of rt_genid by a small quantity [1..256]
890  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
891  * many times (2^24) without giving recent rt_genid.
892  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
893  */
894 static void rt_cache_invalidate(struct net *net)
895 {
896         unsigned char shuffle;
897
898         get_random_bytes(&shuffle, sizeof(shuffle));
899         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
900 }
901
/*
 * Invalidate the route cache of @net, optionally flushing it as well.
 *
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	/* Invalidate-only request: nothing more to do. */
	if (delay < 0)
		return;

	rt_do_flush(!in_softirq());
}
912
/*
 * Flush previous cache invalidated entries from the cache.
 *
 * The argument handed to rt_do_flush() is non-zero when we are not
 * running in softirq context.
 */
void rt_cache_flush_batch(void)
{
	int not_softirq = !in_softirq();

	rt_do_flush(not_softirq);
}
918
919 static void rt_emergency_hash_rebuild(struct net *net)
920 {
921         if (net_ratelimit())
922                 printk(KERN_WARNING "Route hash chain too long!\n");
923         rt_cache_invalidate(net);
924 }
925
926 /*
927    Short description of GC goals.
928
929    We want to build algorithm, which will keep routing cache
930    at some equilibrium point, when number of aged off entries
931    is kept approximately equal to newly generated ones.
932
933    Current expiration strength is variable "expire".
934    We try to adjust it dynamically, so that if networking
935    is idle expires is large enough to keep enough of warm entries,
936    and when load increases it reduces to limit cache size.
937  */
938
939 static int rt_garbage_collect(struct dst_ops *ops)
940 {
941         static unsigned long expire = RT_GC_TIMEOUT;
942         static unsigned long last_gc;
943         static int rover;
944         static int equilibrium;
945         struct rtable *rth, **rthp;
946         unsigned long now = jiffies;
947         int goal;
948
949         /*
950          * Garbage collection is pretty expensive,
951          * do not make it too frequently.
952          */
953
954         RT_CACHE_STAT_INC(gc_total);
955
956         if (now - last_gc < ip_rt_gc_min_interval &&
957             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
958                 RT_CACHE_STAT_INC(gc_ignored);
959                 goto out;
960         }
961
962         /* Calculate number of entries, which we want to expire now. */
963         goal = atomic_read(&ipv4_dst_ops.entries) -
964                 (ip_rt_gc_elasticity << rt_hash_log);
965         if (goal <= 0) {
966                 if (equilibrium < ipv4_dst_ops.gc_thresh)
967                         equilibrium = ipv4_dst_ops.gc_thresh;
968                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
969                 if (goal > 0) {
970                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
972                 }
973         } else {
974                 /* We are in dangerous area. Try to reduce cache really
975                  * aggressively.
976                  */
977                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
979         }
980
981         if (now - last_gc >= ip_rt_gc_min_interval)
982                 last_gc = now;
983
984         if (goal <= 0) {
985                 equilibrium += goal;
986                 goto work_done;
987         }
988
989         do {
990                 int i, k;
991
992                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
993                         unsigned long tmo = expire;
994
995                         k = (k + 1) & rt_hash_mask;
996                         rthp = &rt_hash_table[k].chain;
997                         spin_lock_bh(rt_hash_lock_addr(k));
998                         while ((rth = *rthp) != NULL) {
999                                 if (!rt_is_expired(rth) &&
1000                                         !rt_may_expire(rth, tmo, expire)) {
1001                                         tmo >>= 1;
1002                                         rthp = &rth->dst.rt_next;
1003                                         continue;
1004                                 }
1005                                 *rthp = rth->dst.rt_next;
1006                                 rt_free(rth);
1007                                 goal--;
1008                         }
1009                         spin_unlock_bh(rt_hash_lock_addr(k));
1010                         if (goal <= 0)
1011                                 break;
1012                 }
1013                 rover = k;
1014
1015                 if (goal <= 0)
1016                         goto work_done;
1017
1018                 /* Goal is not achieved. We stop process if:
1019
1020                    - if expire reduced to zero. Otherwise, expire is halfed.
1021                    - if table is not full.
1022                    - if we are called from interrupt.
1023                    - jiffies check is just fallback/debug loop breaker.
1024                      We will not spin here for long time in any case.
1025                  */
1026
1027                 RT_CACHE_STAT_INC(gc_goal_miss);
1028
1029                 if (expire == 0)
1030                         break;
1031
1032                 expire >>= 1;
1033 #if RT_CACHE_DEBUG >= 2
1034                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1036 #endif
1037
1038                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1039                         goto out;
1040         } while (!in_softirq() && time_before_eq(jiffies, now));
1041
1042         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1043                 goto out;
1044         if (net_ratelimit())
1045                 printk(KERN_WARNING "dst cache overflow\n");
1046         RT_CACHE_STAT_INC(gc_dst_overflow);
1047         return 1;
1048
1049 work_done:
1050         expire += ip_rt_gc_min_interval;
1051         if (expire > ip_rt_gc_timeout ||
1052             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1053                 expire = ip_rt_gc_timeout;
1054 #if RT_CACHE_DEBUG >= 2
1055         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1056                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1057 #endif
1058 out:    return 0;
1059 }
1060
1061 /*
1062  * Returns number of entries in a hash chain that have different hash_inputs
1063  */
1064 static int slow_chain_length(const struct rtable *head)
1065 {
1066         int length = 0;
1067         const struct rtable *rth = head;
1068
1069         while (rth) {
1070                 length += has_noalias(head, rth);
1071                 rth = rth->dst.rt_next;
1072         }
1073         return length >> FRACT_BITS;
1074 }
1075
1076 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1077                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1078 {
1079         struct rtable   *rth, **rthp;
1080         unsigned long   now;
1081         struct rtable *cand, **candp;
1082         u32             min_score;
1083         int             chain_length;
1084         int attempts = !in_softirq();
1085
1086 restart:
1087         chain_length = 0;
1088         min_score = ~(u32)0;
1089         cand = NULL;
1090         candp = NULL;
1091         now = jiffies;
1092
1093         if (!rt_caching(dev_net(rt->dst.dev))) {
1094                 /*
1095                  * If we're not caching, just tell the caller we
1096                  * were successful and don't touch the route.  The
1097                  * caller hold the sole reference to the cache entry, and
1098                  * it will be released when the caller is done with it.
1099                  * If we drop it here, the callers have no way to resolve routes
1100                  * when we're not caching.  Instead, just point *rp at rt, so
1101                  * the caller gets a single use out of the route
1102                  * Note that we do rt_free on this new route entry, so that
1103                  * once its refcount hits zero, we are still able to reap it
1104                  * (Thanks Alexey)
1105                  * Note also the rt_free uses call_rcu.  We don't actually
1106                  * need rcu protection here, this is just our path to get
1107                  * on the route gc list.
1108                  */
1109
1110                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111                         int err = arp_bind_neighbour(&rt->dst);
1112                         if (err) {
1113                                 if (net_ratelimit())
1114                                         printk(KERN_WARNING
1115                                             "Neighbour table failure & not caching routes.\n");
1116                                 rt_drop(rt);
1117                                 return err;
1118                         }
1119                 }
1120
1121                 rt_free(rt);
1122                 goto skip_hashing;
1123         }
1124
1125         rthp = &rt_hash_table[hash].chain;
1126
1127         spin_lock_bh(rt_hash_lock_addr(hash));
1128         while ((rth = *rthp) != NULL) {
1129                 if (rt_is_expired(rth)) {
1130                         *rthp = rth->dst.rt_next;
1131                         rt_free(rth);
1132                         continue;
1133                 }
1134                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1135                         /* Put it first */
1136                         *rthp = rth->dst.rt_next;
1137                         /*
1138                          * Since lookup is lockfree, the deletion
1139                          * must be visible to another weakly ordered CPU before
1140                          * the insertion at the start of the hash chain.
1141                          */
1142                         rcu_assign_pointer(rth->dst.rt_next,
1143                                            rt_hash_table[hash].chain);
1144                         /*
1145                          * Since lookup is lockfree, the update writes
1146                          * must be ordered for consistency on SMP.
1147                          */
1148                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149
1150                         dst_use(&rth->dst, now);
1151                         spin_unlock_bh(rt_hash_lock_addr(hash));
1152
1153                         rt_drop(rt);
1154                         if (rp)
1155                                 *rp = rth;
1156                         else
1157                                 skb_dst_set(skb, &rth->dst);
1158                         return 0;
1159                 }
1160
1161                 if (!atomic_read(&rth->dst.__refcnt)) {
1162                         u32 score = rt_score(rth);
1163
1164                         if (score <= min_score) {
1165                                 cand = rth;
1166                                 candp = rthp;
1167                                 min_score = score;
1168                         }
1169                 }
1170
1171                 chain_length++;
1172
1173                 rthp = &rth->dst.rt_next;
1174         }
1175
1176         if (cand) {
1177                 /* ip_rt_gc_elasticity used to be average length of chain
1178                  * length, when exceeded gc becomes really aggressive.
1179                  *
1180                  * The second limit is less certain. At the moment it allows
1181                  * only 2 entries per bucket. We will see.
1182                  */
1183                 if (chain_length > ip_rt_gc_elasticity) {
1184                         *candp = cand->dst.rt_next;
1185                         rt_free(cand);
1186                 }
1187         } else {
1188                 if (chain_length > rt_chain_length_max &&
1189                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190                         struct net *net = dev_net(rt->dst.dev);
1191                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192                         if (!rt_caching(net)) {
1193                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1194                                         rt->dst.dev->name, num);
1195                         }
1196                         rt_emergency_hash_rebuild(net);
1197                         spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1200                                         ifindex, rt_genid(net));
1201                         goto restart;
1202                 }
1203         }
1204
1205         /* Try to bind route to arp only if it is output
1206            route or unicast forwarding path.
1207          */
1208         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1209                 int err = arp_bind_neighbour(&rt->dst);
1210                 if (err) {
1211                         spin_unlock_bh(rt_hash_lock_addr(hash));
1212
1213                         if (err != -ENOBUFS) {
1214                                 rt_drop(rt);
1215                                 return err;
1216                         }
1217
1218                         /* Neighbour tables are full and nothing
1219                            can be released. Try to shrink route cache,
1220                            it is most likely it holds some neighbour records.
1221                          */
1222                         if (attempts-- > 0) {
1223                                 int saved_elasticity = ip_rt_gc_elasticity;
1224                                 int saved_int = ip_rt_gc_min_interval;
1225                                 ip_rt_gc_elasticity     = 1;
1226                                 ip_rt_gc_min_interval   = 0;
1227                                 rt_garbage_collect(&ipv4_dst_ops);
1228                                 ip_rt_gc_min_interval   = saved_int;
1229                                 ip_rt_gc_elasticity     = saved_elasticity;
1230                                 goto restart;
1231                         }
1232
1233                         if (net_ratelimit())
1234                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1235                         rt_drop(rt);
1236                         return -ENOBUFS;
1237                 }
1238         }
1239
1240         rt->dst.rt_next = rt_hash_table[hash].chain;
1241
1242 #if RT_CACHE_DEBUG >= 2
1243         if (rt->dst.rt_next) {
1244                 struct rtable *trt;
1245                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1246                        hash, &rt->rt_dst);
1247                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1248                         printk(" . %pI4", &trt->rt_dst);
1249                 printk("\n");
1250         }
1251 #endif
1252         /*
1253          * Since lookup is lockfree, we must make sure
1254          * previous writes to rt are comitted to memory
1255          * before making rt visible to other CPUS.
1256          */
1257         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1258
1259         spin_unlock_bh(rt_hash_lock_addr(hash));
1260
1261 skip_hashing:
1262         if (rp)
1263                 *rp = rt;
1264         else
1265                 skb_dst_set(skb, &rt->dst);
1266         return 0;
1267 }
1268
1269 void rt_bind_peer(struct rtable *rt, int create)
1270 {
1271         static DEFINE_SPINLOCK(rt_peer_lock);
1272         struct inet_peer *peer;
1273
1274         peer = inet_getpeer(rt->rt_dst, create);
1275
1276         spin_lock_bh(&rt_peer_lock);
1277         if (rt->peer == NULL) {
1278                 rt->peer = peer;
1279                 peer = NULL;
1280         }
1281         spin_unlock_bh(&rt_peer_lock);
1282         if (peer)
1283                 inet_putpeer(peer);
1284 }
1285
1286 /*
1287  * Peer allocation may fail only in serious out-of-memory conditions.  However
1288  * we still can generate some output.
1289  * Random ID selection looks a bit dangerous because we have no chances to
1290  * select ID being unique in a reasonable period of time.
1291  * But broken packet identifier may be better than no packet at all.
1292  */
1293 static void ip_select_fb_ident(struct iphdr *iph)
1294 {
1295         static DEFINE_SPINLOCK(ip_fb_id_lock);
1296         static u32 ip_fallback_id;
1297         u32 salt;
1298
1299         spin_lock_bh(&ip_fb_id_lock);
1300         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1301         iph->id = htons(salt & 0xFFFF);
1302         ip_fallback_id = salt;
1303         spin_unlock_bh(&ip_fb_id_lock);
1304 }
1305
1306 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1307 {
1308         struct rtable *rt = (struct rtable *) dst;
1309
1310         if (rt) {
1311                 if (rt->peer == NULL)
1312                         rt_bind_peer(rt, 1);
1313
1314                 /* If peer is attached to destination, it is never detached,
1315                    so that we need not to grab a lock to dereference it.
1316                  */
1317                 if (rt->peer) {
1318                         iph->id = htons(inet_getid(rt->peer, more));
1319                         return;
1320                 }
1321         } else
1322                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1323                        __builtin_return_address(0));
1324
1325         ip_select_fb_ident(iph);
1326 }
1327
1328 static void rt_del(unsigned hash, struct rtable *rt)
1329 {
1330         struct rtable **rthp, *aux;
1331
1332         rthp = &rt_hash_table[hash].chain;
1333         spin_lock_bh(rt_hash_lock_addr(hash));
1334         ip_rt_put(rt);
1335         while ((aux = *rthp) != NULL) {
1336                 if (aux == rt || rt_is_expired(aux)) {
1337                         *rthp = aux->dst.rt_next;
1338                         rt_free(aux);
1339                         continue;
1340                 }
1341                 rthp = &aux->dst.rt_next;
1342         }
1343         spin_unlock_bh(rt_hash_lock_addr(hash));
1344 }
1345
1346 /* called in rcu_read_lock() section */
1347 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1348                     __be32 saddr, struct net_device *dev)
1349 {
1350         int i, k;
1351         struct in_device *in_dev = __in_dev_get_rcu(dev);
1352         struct rtable *rth, **rthp;
1353         __be32  skeys[2] = { saddr, 0 };
1354         int  ikeys[2] = { dev->ifindex, 0 };
1355         struct netevent_redirect netevent;
1356         struct net *net;
1357
1358         if (!in_dev)
1359                 return;
1360
1361         net = dev_net(dev);
1362         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1363             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1364             ipv4_is_zeronet(new_gw))
1365                 goto reject_redirect;
1366
1367         if (!rt_caching(net))
1368                 goto reject_redirect;
1369
1370         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1371                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1372                         goto reject_redirect;
1373                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1374                         goto reject_redirect;
1375         } else {
1376                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1377                         goto reject_redirect;
1378         }
1379
1380         for (i = 0; i < 2; i++) {
1381                 for (k = 0; k < 2; k++) {
1382                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1383                                                 rt_genid(net));
1384
1385                         rthp=&rt_hash_table[hash].chain;
1386
1387                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1388                                 struct rtable *rt;
1389
1390                                 if (rth->fl.fl4_dst != daddr ||
1391                                     rth->fl.fl4_src != skeys[i] ||
1392                                     rth->fl.oif != ikeys[k] ||
1393                                     rth->fl.iif != 0 ||
1394                                     rt_is_expired(rth) ||
1395                                     !net_eq(dev_net(rth->dst.dev), net)) {
1396                                         rthp = &rth->dst.rt_next;
1397                                         continue;
1398                                 }
1399
1400                                 if (rth->rt_dst != daddr ||
1401                                     rth->rt_src != saddr ||
1402                                     rth->dst.error ||
1403                                     rth->rt_gateway != old_gw ||
1404                                     rth->dst.dev != dev)
1405                                         break;
1406
1407                                 dst_hold(&rth->dst);
1408
1409                                 rt = dst_alloc(&ipv4_dst_ops);
1410                                 if (rt == NULL) {
1411                                         ip_rt_put(rth);
1412                                         return;
1413                                 }
1414
1415                                 /* Copy all the information. */
1416                                 *rt = *rth;
1417                                 rt->dst.__use           = 1;
1418                                 atomic_set(&rt->dst.__refcnt, 1);
1419                                 rt->dst.child           = NULL;
1420                                 if (rt->dst.dev)
1421                                         dev_hold(rt->dst.dev);
1422                                 if (rt->idev)
1423                                         in_dev_hold(rt->idev);
1424                                 rt->dst.obsolete        = -1;
1425                                 rt->dst.lastuse = jiffies;
1426                                 rt->dst.path            = &rt->dst;
1427                                 rt->dst.neighbour       = NULL;
1428                                 rt->dst.hh              = NULL;
1429 #ifdef CONFIG_XFRM
1430                                 rt->dst.xfrm            = NULL;
1431 #endif
1432                                 rt->rt_genid            = rt_genid(net);
1433                                 rt->rt_flags            |= RTCF_REDIRECTED;
1434
1435                                 /* Gateway is different ... */
1436                                 rt->rt_gateway          = new_gw;
1437
1438                                 /* Redirect received -> path was valid */
1439                                 dst_confirm(&rth->dst);
1440
1441                                 if (rt->peer)
1442                                         atomic_inc(&rt->peer->refcnt);
1443
1444                                 if (arp_bind_neighbour(&rt->dst) ||
1445                                     !(rt->dst.neighbour->nud_state &
1446                                             NUD_VALID)) {
1447                                         if (rt->dst.neighbour)
1448                                                 neigh_event_send(rt->dst.neighbour, NULL);
1449                                         ip_rt_put(rth);
1450                                         rt_drop(rt);
1451                                         goto do_next;
1452                                 }
1453
1454                                 netevent.old = &rth->dst;
1455                                 netevent.new = &rt->dst;
1456                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1457                                                         &netevent);
1458
1459                                 rt_del(hash, rth);
1460                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1461                                         ip_rt_put(rt);
1462                                 goto do_next;
1463                         }
1464                 do_next:
1465                         ;
1466                 }
1467         }
1468         return;
1469
1470 reject_redirect:
1471 #ifdef CONFIG_IP_ROUTE_VERBOSE
1472         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1473                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1474                         "  Advised path = %pI4 -> %pI4\n",
1475                        &old_gw, dev->name, &new_gw,
1476                        &saddr, &daddr);
1477 #endif
1478         ;
1479 }
1480
1481 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1482 {
1483         struct rtable *rt = (struct rtable *)dst;
1484         struct dst_entry *ret = dst;
1485
1486         if (rt) {
1487                 if (dst->obsolete > 0) {
1488                         ip_rt_put(rt);
1489                         ret = NULL;
1490                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1491                            (rt->dst.expires &&
1492                             time_after_eq(jiffies, rt->dst.expires))) {
1493                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1494                                                 rt->fl.oif,
1495                                                 rt_genid(dev_net(dst->dev)));
1496 #if RT_CACHE_DEBUG >= 1
1497                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1498                                 &rt->rt_dst, rt->fl.fl4_tos);
1499 #endif
1500                         rt_del(hash, rt);
1501                         ret = NULL;
1502                 }
1503         }
1504         return ret;
1505 }
1506
1507 /*
1508  * Algorithm:
1509  *      1. The first ip_rt_redirect_number redirects are sent
1510  *         with exponential backoff, then we stop sending them at all,
1511  *         assuming that the host ignores our redirects.
1512  *      2. If we did not see packets requiring redirects
1513  *         during ip_rt_redirect_silence, we assume that the host
1514  *         forgot redirected route and start to send redirects again.
1515  *
1516  * This algorithm is much cheaper and more intelligent than dumb load limiting
1517  * in icmp.c.
1518  *
1519  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1520  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1521  */
1522
1523 void ip_rt_send_redirect(struct sk_buff *skb)
1524 {
1525         struct rtable *rt = skb_rtable(skb);
1526         struct in_device *in_dev;
1527         int log_martians;
1528
1529         rcu_read_lock();
1530         in_dev = __in_dev_get_rcu(rt->dst.dev);
1531         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1532                 rcu_read_unlock();
1533                 return;
1534         }
1535         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1536         rcu_read_unlock();
1537
1538         /* No redirected packets during ip_rt_redirect_silence;
1539          * reset the algorithm.
1540          */
1541         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1542                 rt->dst.rate_tokens = 0;
1543
1544         /* Too many ignored redirects; do not send anything
1545          * set dst.rate_last to the last seen redirected packet.
1546          */
1547         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1548                 rt->dst.rate_last = jiffies;
1549                 return;
1550         }
1551
1552         /* Check for load limit; set rate_last to the latest sent
1553          * redirect.
1554          */
1555         if (rt->dst.rate_tokens == 0 ||
1556             time_after(jiffies,
1557                        (rt->dst.rate_last +
1558                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1559                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1560                 rt->dst.rate_last = jiffies;
1561                 ++rt->dst.rate_tokens;
1562 #ifdef CONFIG_IP_ROUTE_VERBOSE
1563                 if (log_martians &&
1564                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1565                     net_ratelimit())
1566                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1567                                 &rt->rt_src, rt->rt_iif,
1568                                 &rt->rt_dst, &rt->rt_gateway);
1569 #endif
1570         }
1571 }
1572
1573 static int ip_error(struct sk_buff *skb)
1574 {
1575         struct rtable *rt = skb_rtable(skb);
1576         unsigned long now;
1577         int code;
1578
1579         switch (rt->dst.error) {
1580                 case EINVAL:
1581                 default:
1582                         goto out;
1583                 case EHOSTUNREACH:
1584                         code = ICMP_HOST_UNREACH;
1585                         break;
1586                 case ENETUNREACH:
1587                         code = ICMP_NET_UNREACH;
1588                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1589                                         IPSTATS_MIB_INNOROUTES);
1590                         break;
1591                 case EACCES:
1592                         code = ICMP_PKT_FILTERED;
1593                         break;
1594         }
1595
1596         now = jiffies;
1597         rt->dst.rate_tokens += now - rt->dst.rate_last;
1598         if (rt->dst.rate_tokens > ip_rt_error_burst)
1599                 rt->dst.rate_tokens = ip_rt_error_burst;
1600         rt->dst.rate_last = now;
1601         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1602                 rt->dst.rate_tokens -= ip_rt_error_cost;
1603                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1604         }
1605
1606 out:    kfree_skb(skb);
1607         return 0;
1608 }
1609
/*
 *	MTU plateau table, in descending order.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Returns the largest plateau value that is strictly smaller than
 * @old_mtu, or 68 when @old_mtu is not above any plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int idx = 0;

	/* The table is sorted high to low, so the first hit is the answer. */
	while (idx < ARRAY_SIZE(mtu_plateau)) {
		if (mtu_plateau[idx] < old_mtu)
			return mtu_plateau[idx];
		idx++;
	}

	return 68;
}
1627
/*
 * ip_rt_frag_needed - lower the MTU of matching route cache entries.
 * @net:     namespace the cache entries must belong to
 * @iph:     IP header supplying saddr, daddr, tot_len and ihl for matching
 *           (presumably the header quoted in an ICMP "frag needed" error --
 *           confirm against the caller)
 * @new_mtu: MTU value reported for the path
 * @dev:     device whose ifindex is used as one of the two oif keys
 *
 * Four hash buckets are probed: oif in { dev->ifindex, 0 } combined with
 * flow source in { iph->saddr, 0 }.  Entries whose MTU metric is locked,
 * that are expired or that belong to another namespace are skipped.
 *
 * Returns the MTU estimate of the last entry that qualified (mtu <=
 * dst_mtu), or @new_mtu when no entry qualified.
 */
1628 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1629                                  unsigned short new_mtu,
1630                                  struct net_device *dev)
1631 {
1632         int i, k;
1633         unsigned short old_mtu = ntohs(iph->tot_len);
1634         struct rtable *rth;
1635         int  ikeys[2] = { dev->ifindex, 0 };     /* oif keys tried */
1636         __be32  skeys[2] = { iph->saddr, 0, };   /* flow source keys tried */
1637         __be32  daddr = iph->daddr;
1638         unsigned short est_mtu = 0;
1639
1640         for (k = 0; k < 2; k++) {
1641                 for (i = 0; i < 2; i++) {
1642                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1643                                                 rt_genid(net));
1644
1645                         rcu_read_lock();
1646                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1647                              rth = rcu_dereference(rth->dst.rt_next)) {
1648                                 unsigned short mtu = new_mtu;
1649
                                     /* Only output routes (iif == 0) with an
                                      * unlocked MTU metric are candidates. */
1650                                 if (rth->fl.fl4_dst != daddr ||
1651                                     rth->fl.fl4_src != skeys[i] ||
1652                                     rth->rt_dst != daddr ||
1653                                     rth->rt_src != iph->saddr ||
1654                                     rth->fl.oif != ikeys[k] ||
1655                                     rth->fl.iif != 0 ||
1656                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1657                                     !net_eq(dev_net(rth->dst.dev), net) ||
1658                                     rt_is_expired(rth))
1659                                         continue;
1660
                                     /* Reported MTU is unusable (below 68 or not
                                      * smaller than the packet): guess one from
                                      * the plateau table instead. */
1661                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1662
1663                                         /* BSD 4.2 compatibility hack :-(
                                              * With a zero reported MTU, strip the
                                              * header length from old_mtu first.
                                              * NOTE(review): old_mtu is shared by all
                                              * iterations, so this subtraction can be
                                              * applied again for a later matching
                                              * entry -- confirm this is intended. */
1664                                         if (mtu == 0 &&
1665                                             old_mtu >= dst_mtu(&rth->dst) &&
1666                                             old_mtu >= 68 + (iph->ihl << 2))
1667                                                 old_mtu -= iph->ihl << 2;
1668
1669                                         mtu = guess_mtu(old_mtu);
1670                                 }
1671                                 if (mtu <= dst_mtu(&rth->dst)) {
1672                                         if (mtu < dst_mtu(&rth->dst)) {
1673                                                 dst_confirm(&rth->dst);
                                                     /* Never go below ip_rt_min_pmtu;
                                                      * when clamping, lock the metric. */
1674                                                 if (mtu < ip_rt_min_pmtu) {
1675                                                         mtu = ip_rt_min_pmtu;
1676                                                         rth->dst.metrics[RTAX_LOCK-1] |=
1677                                                                 (1 << RTAX_MTU);
1678                                                 }
1679                                                 rth->dst.metrics[RTAX_MTU-1] = mtu;
1680                                                 dst_set_expires(&rth->dst,
1681                                                         ip_rt_mtu_expires);
1682                                         }
1683                                         est_mtu = mtu;
1684                                 }
1685                         }
1686                         rcu_read_unlock();
1687                 }
1688         }
1689         return est_mtu ? : new_mtu;
1690 }
1691
1692 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1693 {
1694         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1695             !(dst_metric_locked(dst, RTAX_MTU))) {
1696                 if (mtu < ip_rt_min_pmtu) {
1697                         mtu = ip_rt_min_pmtu;
1698                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1699                 }
1700                 dst->metrics[RTAX_MTU-1] = mtu;
1701                 dst_set_expires(dst, ip_rt_mtu_expires);
1702                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1703         }
1704 }
1705
1706 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1707 {
1708         if (rt_is_expired((struct rtable *)dst))
1709                 return NULL;
1710         return dst;
1711 }
1712
1713 static void ipv4_dst_destroy(struct dst_entry *dst)
1714 {
1715         struct rtable *rt = (struct rtable *) dst;
1716         struct inet_peer *peer = rt->peer;
1717         struct in_device *idev = rt->idev;
1718
1719         if (peer) {
1720                 rt->peer = NULL;
1721                 inet_putpeer(peer);
1722         }
1723
1724         if (idev) {
1725                 rt->idev = NULL;
1726                 in_dev_put(idev);
1727         }
1728 }
1729
1730 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1731                             int how)
1732 {
1733         struct rtable *rt = (struct rtable *) dst;
1734         struct in_device *idev = rt->idev;
1735         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1736                 struct in_device *loopback_idev =
1737                         in_dev_get(dev_net(dev)->loopback_dev);
1738                 if (loopback_idev) {
1739                         rt->idev = loopback_idev;
1740                         in_dev_put(idev);
1741                 }
1742         }
1743 }
1744
1745 static void ipv4_link_failure(struct sk_buff *skb)
1746 {
1747         struct rtable *rt;
1748
1749         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1750
1751         rt = skb_rtable(skb);
1752         if (rt)
1753                 dst_set_expires(&rt->dst, 0);
1754 }
1755
1756 static int ip_rt_bug(struct sk_buff *skb)
1757 {
1758         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1759                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1760                 skb->dev ? skb->dev->name : "?");
1761         kfree_skb(skb);
1762         return 0;
1763 }
1764
1765 /*
1766    We do not cache source address of outgoing interface,
1767    because it is used only by IP RR, TS and SRR options,
1768    so that it out of fast path.
1769
1770    BTW remember: "addr" is allowed to be not aligned
1771    in IP options!
1772  */
1773
1774 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1775 {
1776         __be32 src;
1777         struct fib_result res;
1778
1779         if (rt->fl.iif == 0)
1780                 src = rt->rt_src;
1781         else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
1782                 src = FIB_RES_PREFSRC(res);
1783                 fib_res_put(&res);
1784         } else
1785                 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1786                                         RT_SCOPE_UNIVERSE);
1787         memcpy(addr, &src, 4);
1788 }
1789
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time:
 * a half of tclassid is filled from @tag only if it is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;

        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1799
1800 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1801 {
1802         struct fib_info *fi = res->fi;
1803
1804         if (fi) {
1805                 if (FIB_RES_GW(*res) &&
1806                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1807                         rt->rt_gateway = FIB_RES_GW(*res);
1808                 memcpy(rt->dst.metrics, fi->fib_metrics,
1809                        sizeof(rt->dst.metrics));
1810                 if (fi->fib_mtu == 0) {
1811                         rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1812                         if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1813                             rt->rt_gateway != rt->rt_dst &&
1814                             rt->dst.dev->mtu > 576)
1815                                 rt->dst.metrics[RTAX_MTU-1] = 576;
1816                 }
1817 #ifdef CONFIG_NET_CLS_ROUTE
1818                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1819 #endif
1820         } else
1821                 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1822
1823         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1824                 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1825         if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1826                 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1827         if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1828                 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1829                                        ip_rt_min_advmss);
1830         if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1831                 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1832
1833 #ifdef CONFIG_NET_CLS_ROUTE
1834 #ifdef CONFIG_IP_MULTIPLE_TABLES
1835         set_class_tag(rt, fib_rules_tclass(res));
1836 #endif
1837         set_class_tag(rt, itag);
1838 #endif
1839         rt->rt_type = res->type;
1840 }
1841
1842 /* called in rcu_read_lock() section */
1843 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1844                                 u8 tos, struct net_device *dev, int our)
1845 {
1846         unsigned int hash;
1847         struct rtable *rth;
1848         __be32 spec_dst;
1849         struct in_device *in_dev = __in_dev_get_rcu(dev);
1850         u32 itag = 0;
1851         int err;
1852
1853         /* Primary sanity checks. */
1854
1855         if (in_dev == NULL)
1856                 return -EINVAL;
1857
1858         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1859             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1860                 goto e_inval;
1861
1862         if (ipv4_is_zeronet(saddr)) {
1863                 if (!ipv4_is_local_multicast(daddr))
1864                         goto e_inval;
1865                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1866         } else {
1867                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1868                                           &itag, 0);
1869                 if (err < 0)
1870                         goto e_err;
1871         }
1872         rth = dst_alloc(&ipv4_dst_ops);
1873         if (!rth)
1874                 goto e_nobufs;
1875
1876         rth->dst.output = ip_rt_bug;
1877         rth->dst.obsolete = -1;
1878
1879         atomic_set(&rth->dst.__refcnt, 1);
1880         rth->dst.flags= DST_HOST;
1881         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1882                 rth->dst.flags |= DST_NOPOLICY;
1883         rth->fl.fl4_dst = daddr;
1884         rth->rt_dst     = daddr;
1885         rth->fl.fl4_tos = tos;
1886         rth->fl.mark    = skb->mark;
1887         rth->fl.fl4_src = saddr;
1888         rth->rt_src     = saddr;
1889 #ifdef CONFIG_NET_CLS_ROUTE
1890         rth->dst.tclassid = itag;
1891 #endif
1892         rth->rt_iif     =
1893         rth->fl.iif     = dev->ifindex;
1894         rth->dst.dev    = init_net.loopback_dev;
1895         dev_hold(rth->dst.dev);
1896         rth->idev       = in_dev_get(rth->dst.dev);
1897         rth->fl.oif     = 0;
1898         rth->rt_gateway = daddr;
1899         rth->rt_spec_dst= spec_dst;
1900         rth->rt_genid   = rt_genid(dev_net(dev));
1901         rth->rt_flags   = RTCF_MULTICAST;
1902         rth->rt_type    = RTN_MULTICAST;
1903         if (our) {
1904                 rth->dst.input= ip_local_deliver;
1905                 rth->rt_flags |= RTCF_LOCAL;
1906         }
1907
1908 #ifdef CONFIG_IP_MROUTE
1909         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1910                 rth->dst.input = ip_mr_input;
1911 #endif
1912         RT_CACHE_STAT_INC(in_slow_mc);
1913
1914         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1915         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1916
1917 e_nobufs:
1918         return -ENOBUFS;
1919 e_inval:
1920         return -EINVAL;
1921 e_err:
1922         return err;
1923 }
1924
1925
1926 static void ip_handle_martian_source(struct net_device *dev,
1927                                      struct in_device *in_dev,
1928                                      struct sk_buff *skb,
1929                                      __be32 daddr,
1930                                      __be32 saddr)
1931 {
1932         RT_CACHE_STAT_INC(in_martian_src);
1933 #ifdef CONFIG_IP_ROUTE_VERBOSE
1934         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1935                 /*
1936                  *      RFC1812 recommendation, if source is martian,
1937                  *      the only hint is MAC header.
1938                  */
1939                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1940                         &daddr, &saddr, dev->name);
1941                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1942                         int i;
1943                         const unsigned char *p = skb_mac_header(skb);
1944                         printk(KERN_WARNING "ll header: ");
1945                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1946                                 printk("%02x", *p);
1947                                 if (i < (dev->hard_header_len - 1))
1948                                         printk(":");
1949                         }
1950                         printk("\n");
1951                 }
1952         }
1953 #endif
1954 }
1955
1956 /* called in rcu_read_lock() section */
1957 static int __mkroute_input(struct sk_buff *skb,
1958                            struct fib_result *res,
1959                            struct in_device *in_dev,
1960                            __be32 daddr, __be32 saddr, u32 tos,
1961                            struct rtable **result)
1962 {
1963         struct rtable *rth;
1964         int err;
1965         struct in_device *out_dev;
1966         unsigned int flags = 0;
1967         __be32 spec_dst;
1968         u32 itag;
1969
1970         /* get a working reference to the output device */
1971         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1972         if (out_dev == NULL) {
1973                 if (net_ratelimit())
1974                         printk(KERN_CRIT "Bug in ip_route_input" \
1975                                "_slow(). Please, report\n");
1976                 return -EINVAL;
1977         }
1978
1979
1980         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1981                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1982         if (err < 0) {
1983                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1984                                          saddr);
1985
1986                 goto cleanup;
1987         }
1988
1989         if (err)
1990                 flags |= RTCF_DIRECTSRC;
1991
1992         if (out_dev == in_dev && err &&
1993             (IN_DEV_SHARED_MEDIA(out_dev) ||
1994              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1995                 flags |= RTCF_DOREDIRECT;
1996
1997         if (skb->protocol != htons(ETH_P_IP)) {
1998                 /* Not IP (i.e. ARP). Do not create route, if it is
1999                  * invalid for proxy arp. DNAT routes are always valid.
2000                  *
2001                  * Proxy arp feature have been extended to allow, ARP
2002                  * replies back to the same interface, to support
2003                  * Private VLAN switch technologies. See arp.c.
2004                  */
2005                 if (out_dev == in_dev &&
2006                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2007                         err = -EINVAL;
2008                         goto cleanup;
2009                 }
2010         }
2011
2012
2013         rth = dst_alloc(&ipv4_dst_ops);
2014         if (!rth) {
2015                 err = -ENOBUFS;
2016                 goto cleanup;
2017         }
2018
2019         atomic_set(&rth->dst.__refcnt, 1);
2020         rth->dst.flags= DST_HOST;
2021         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2022                 rth->dst.flags |= DST_NOPOLICY;
2023         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2024                 rth->dst.flags |= DST_NOXFRM;
2025         rth->fl.fl4_dst = daddr;
2026         rth->rt_dst     = daddr;
2027         rth->fl.fl4_tos = tos;
2028         rth->fl.mark    = skb->mark;
2029         rth->fl.fl4_src = saddr;
2030         rth->rt_src     = saddr;
2031         rth->rt_gateway = daddr;
2032         rth->rt_iif     =
2033                 rth->fl.iif     = in_dev->dev->ifindex;
2034         rth->dst.dev    = (out_dev)->dev;
2035         dev_hold(rth->dst.dev);
2036         rth->idev       = in_dev_get(rth->dst.dev);
2037         rth->fl.oif     = 0;
2038         rth->rt_spec_dst= spec_dst;
2039
2040         rth->dst.obsolete = -1;
2041         rth->dst.input = ip_forward;
2042         rth->dst.output = ip_output;
2043         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2044
2045         rt_set_nexthop(rth, res, itag);
2046
2047         rth->rt_flags = flags;
2048
2049         *result = rth;
2050         err = 0;
2051  cleanup:
2052         return err;
2053 }
2054
2055 static int ip_mkroute_input(struct sk_buff *skb,
2056                             struct fib_result *res,
2057                             const struct flowi *fl,
2058                             struct in_device *in_dev,
2059                             __be32 daddr, __be32 saddr, u32 tos)
2060 {
2061         struct rtable* rth = NULL;
2062         int err;
2063         unsigned hash;
2064
2065 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2066         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2067                 fib_select_multipath(fl, res);
2068 #endif
2069
2070         /* create a routing cache entry */
2071         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2072         if (err)
2073                 return err;
2074
2075         /* put it into the cache */
2076         hash = rt_hash(daddr, saddr, fl->iif,
2077                        rt_genid(dev_net(rth->dst.dev)));
2078         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2079 }
2080
2081 /*
2082  *      NOTE. We drop all packets that have a local source
2083  *      address, because every properly looped back packet
2084  *      must already have the correct destination attached by the output routine.
2085  *
2086  *      This approach solves two big problems:
2087  *      1. Non-simplex devices are handled properly.
2088  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2089  */
2090
/*
 * ip_route_input_slow - slow-path input routing (route cache miss).
 *
 * Classifies the packet as martian, broadcast, local, unreachable or
 * forwardable and, except for martians and plain errors, builds a route
 * and inserts it into the cache via rt_intern_hash().  The visible
 * caller, ip_route_input_common(), invokes this under rcu_read_lock().
 *
 * Returns 0 or a negative errno.  The FIB result is released at "done"
 * only when fib_lookup() succeeded (free_res != 0).
 */
2091 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2092                                u8 tos, struct net_device *dev)
2093 {
2094         struct fib_result res;
2095         struct in_device *in_dev = __in_dev_get_rcu(dev);
2096         struct flowi fl = { .nl_u = { .ip4_u =
2097                                       { .daddr = daddr,
2098                                         .saddr = saddr,
2099                                         .tos = tos,
2100                                         .scope = RT_SCOPE_UNIVERSE,
2101                                       } },
2102                             .mark = skb->mark,
2103                             .iif = dev->ifindex };
2104         unsigned        flags = 0;
2105         u32             itag = 0;
2106         struct rtable * rth;
2107         unsigned        hash;
2108         __be32          spec_dst;
2109         int             err = -EINVAL;
2110         int             free_res = 0;   /* set once res holds a reference */
2111         struct net    * net = dev_net(dev);
2112
2113         /* IP on this device is disabled. */
2114
2115         if (!in_dev)
2116                 goto out;
2117
2118         /* Check for the weirdest martians, which cannot be detected
2119            by fib_lookup.
2120          */
2121
2122         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2123             ipv4_is_loopback(saddr))
2124                 goto martian_source;
2125
             /* Limited broadcast, or all-zero source and destination. */
2126         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2127                 goto brd_input;
2128
2129         /* Accept zero addresses only to limited broadcast;
2130          * I do not even know whether to fix it or not. Waiting for complaints :-)
2131          */
2132         if (ipv4_is_zeronet(saddr))
2133                 goto martian_source;
2134
2135         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2136             ipv4_is_loopback(daddr))
2137                 goto martian_destination;
2138
2139         /*
2140          *      Now we are ready to route packet.
2141          */
2142         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2143                 if (!IN_DEV_FORWARD(in_dev))
2144                         goto e_hostunreach;
2145                 goto no_route;
2146         }
2147         free_res = 1;
2148
2149         RT_CACHE_STAT_INC(in_slow_tot);
2150
2151         if (res.type == RTN_BROADCAST)
2152                 goto brd_input;
2153
2154         if (res.type == RTN_LOCAL) {
2155                 err = fib_validate_source(saddr, daddr, tos,
2156                                              net->loopback_dev->ifindex,
2157                                              dev, &spec_dst, &itag, skb->mark);
2158                 if (err < 0)
2159                         goto martian_source_keep_err;
2160                 if (err)
2161                         flags |= RTCF_DIRECTSRC;
                     /* For local delivery the specific destination is daddr,
                      * overriding what fib_validate_source() stored. */
2162                 spec_dst = daddr;
2163                 goto local_input;
2164         }
2165
2166         if (!IN_DEV_FORWARD(in_dev))
2167                 goto e_hostunreach;
2168         if (res.type != RTN_UNICAST)
2169                 goto martian_destination;
2170
             /* Forwarding case: build and cache a forward route. */
2171         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2172 done:
2173         if (free_res)
2174                 fib_res_put(&res);
2175 out:    return err;
2176
     /* Broadcast input: only IP frames are accepted. */
2177 brd_input:
2178         if (skb->protocol != htons(ETH_P_IP))
2179                 goto e_inval;
2180
2181         if (ipv4_is_zeronet(saddr))
2182                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2183         else {
2184                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2185                                           &itag, skb->mark);
2186                 if (err < 0)
2187                         goto martian_source_keep_err;
2188                 if (err)
2189                         flags |= RTCF_DIRECTSRC;
2190         }
2191         flags |= RTCF_BROADCAST;
2192         res.type = RTN_BROADCAST;
2193         RT_CACHE_STAT_INC(in_brd);
2194
     /* Shared tail for local, broadcast and unreachable routes: the route
      * is bound to the namespace's loopback device. */
2195 local_input:
2196         rth = dst_alloc(&ipv4_dst_ops);
2197         if (!rth)
2198                 goto e_nobufs;
2199
2200         rth->dst.output= ip_rt_bug;
2201         rth->dst.obsolete = -1;
2202         rth->rt_genid = rt_genid(net);
2203
2204         atomic_set(&rth->dst.__refcnt, 1);
2205         rth->dst.flags= DST_HOST;
2206         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2207                 rth->dst.flags |= DST_NOPOLICY;
2208         rth->fl.fl4_dst = daddr;
2209         rth->rt_dst     = daddr;
2210         rth->fl.fl4_tos = tos;
2211         rth->fl.mark    = skb->mark;
2212         rth->fl.fl4_src = saddr;
2213         rth->rt_src     = saddr;
2214 #ifdef CONFIG_NET_CLS_ROUTE
2215         rth->dst.tclassid = itag;
2216 #endif
2217         rth->rt_iif     =
2218         rth->fl.iif     = dev->ifindex;
2219         rth->dst.dev    = net->loopback_dev;
2220         dev_hold(rth->dst.dev);
2221         rth->idev       = in_dev_get(rth->dst.dev);
2222         rth->rt_gateway = daddr;
2223         rth->rt_spec_dst= spec_dst;
2224         rth->dst.input= ip_local_deliver;
2225         rth->rt_flags   = flags|RTCF_LOCAL;
             /* Reached via no_route: the cached entry answers with ip_error()
              * and carries the positive errno in dst.error. */
2226         if (res.type == RTN_UNREACHABLE) {
2227                 rth->dst.input= ip_error;
2228                 rth->dst.error= -err;
2229                 rth->rt_flags   &= ~RTCF_LOCAL;
2230         }
2231         rth->rt_type    = res.type;
2232         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2233         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2234         goto done;
2235
     /* fib_lookup() failed on a forwarding interface: cache an unreachable
      * route; -ESRCH is reported as -ENETUNREACH. */
2236 no_route:
2237         RT_CACHE_STAT_INC(in_no_route);
2238         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2239         res.type = RTN_UNREACHABLE;
2240         if (err == -ESRCH)
2241                 err = -ENETUNREACH;
2242         goto local_input;
2243
2244         /*
2245          *      Do not cache martian addresses: they should be logged (RFC1812)
2246          */
2247 martian_destination:
2248         RT_CACHE_STAT_INC(in_martian_dst);
2249 #ifdef CONFIG_IP_ROUTE_VERBOSE
2250         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2251                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2252                         &daddr, &saddr, dev->name);
2253 #endif
2254
     /* martian_destination falls through to here. */
2255 e_hostunreach:
2256         err = -EHOSTUNREACH;
2257         goto done;
2258
2259 e_inval:
2260         err = -EINVAL;
2261         goto done;
2262
2263 e_nobufs:
2264         err = -ENOBUFS;
2265         goto done;
2266
2267 martian_source:
2268         err = -EINVAL;
     /* Entered directly when err already holds the validation error. */
2269 martian_source_keep_err:
2270         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2271         goto done;
2272 }
2273
2274 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2275                            u8 tos, struct net_device *dev, bool noref)
2276 {
2277         struct rtable * rth;
2278         unsigned        hash;
2279         int iif = dev->ifindex;
2280         struct net *net;
2281         int res;
2282
2283         net = dev_net(dev);
2284
2285         rcu_read_lock();
2286
2287         if (!rt_caching(net))
2288                 goto skip_cache;
2289
2290         tos &= IPTOS_RT_MASK;
2291         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2292
2293         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2294              rth = rcu_dereference(rth->dst.rt_next)) {
2295                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2296                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2297                      (rth->fl.iif ^ iif) |
2298                      rth->fl.oif |
2299                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2300                     rth->fl.mark == skb->mark &&
2301                     net_eq(dev_net(rth->dst.dev), net) &&
2302                     !rt_is_expired(rth)) {
2303                         if (noref) {
2304                                 dst_use_noref(&rth->dst, jiffies);
2305                                 skb_dst_set_noref(skb, &rth->dst);
2306                         } else {
2307                                 dst_use(&rth->dst, jiffies);
2308                                 skb_dst_set(skb, &rth->dst);
2309                         }
2310                         RT_CACHE_STAT_INC(in_hit);
2311                         rcu_read_unlock();
2312                         return 0;
2313                 }
2314                 RT_CACHE_STAT_INC(in_hlist_search);
2315         }
2316
2317 skip_cache:
2318         /* Multicast recognition logic is moved from route cache to here.
2319            The problem was that too many Ethernet cards have broken/missing
2320            hardware multicast filters :-( As result the host on multicasting
2321            network acquires a lot of useless route cache entries, sort of
2322            SDR messages from all the world. Now we try to get rid of them.
2323            Really, provided software IP multicast filter is organized
2324            reasonably (at least, hashed), it does not result in a slowdown
2325            comparing with route cache reject entries.
2326            Note, that multicast routers are not affected, because
2327            route cache entry is created eventually.
2328          */
2329         if (ipv4_is_multicast(daddr)) {
2330                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2331
2332                 if (in_dev) {
2333                         int our = ip_check_mc(in_dev, daddr, saddr,
2334                                               ip_hdr(skb)->protocol);
2335                         if (our
2336 #ifdef CONFIG_IP_MROUTE
2337                                 ||
2338                             (!ipv4_is_local_multicast(daddr) &&
2339                              IN_DEV_MFORWARD(in_dev))
2340 #endif
2341                            ) {
2342                                 int res = ip_route_input_mc(skb, daddr, saddr,
2343                                                             tos, dev, our);
2344                                 rcu_read_unlock();
2345                                 return res;
2346                         }
2347                 }
2348                 rcu_read_unlock();
2349                 return -EINVAL;
2350         }
2351         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2352         rcu_read_unlock();
2353         return res;
2354 }
2355 EXPORT_SYMBOL(ip_route_input_common);
2356
/*
 * __mkroute_output - build (but do not cache) an output route.
 * @result:  receives the new route on success
 * @res:     FIB lookup result; res->type may be overridden and res->fi
 *           may be released and cleared here
 * @fl:      resolved flow (supplies rt_dst, rt_src, gateway, spec_dst)
 * @oldflp:  flow as originally requested (supplies the cache key fields)
 * @dev_out: output device
 * @flags:   initial RTCF_* flags
 *
 * Returns 0 with *@result set, -EINVAL for an unacceptable
 * source/destination or a device without in_device, or -ENOBUFS when
 * no route could be allocated.
 */
2357 static int __mkroute_output(struct rtable **result,
2358                             struct fib_result *res,
2359                             const struct flowi *fl,
2360                             const struct flowi *oldflp,
2361                             struct net_device *dev_out,
2362                             unsigned flags)
2363 {
2364         struct rtable *rth;
2365         struct in_device *in_dev;
2366         u32 tos = RT_FL_TOS(oldflp);
2367         int err = 0;
2368
             /* A loopback source address may only leave via a loopback device. */
2369         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2370                 return -EINVAL;
2371
             /* Derive the route type from the destination address. */
2372         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2373                 res->type = RTN_BROADCAST;
2374         else if (ipv4_is_multicast(fl->fl4_dst))
2375                 res->type = RTN_MULTICAST;
2376         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2377                 return -EINVAL;
2378
2379         if (dev_out->flags & IFF_LOOPBACK)
2380                 flags |= RTCF_LOCAL;
2381
2382         /* get work reference to inet device */
2383         in_dev = in_dev_get(dev_out);
2384         if (!in_dev)
2385                 return -EINVAL;
2386
2387         if (res->type == RTN_BROADCAST) {
2388                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
                     /* Broadcast routes do not use the fib_info: drop it. */
2389                 if (res->fi) {
2390                         fib_info_put(res->fi);
2391                         res->fi = NULL;
2392                 }
2393         } else if (res->type == RTN_MULTICAST) {
2394                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
                     /* Keep RTCF_LOCAL only if ip_check_mc() reports a match. */
2395                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2396                                  oldflp->proto))
2397                         flags &= ~RTCF_LOCAL;
2398                 /* If a multicast route does not exist, use the
2399                    default one, but do not use a gateway in this case.
2400                    Yes, it is a hack.
2401                  */
2402                 if (res->fi && res->prefixlen < 4) {
2403                         fib_info_put(res->fi);
2404                         res->fi = NULL;
2405                 }
2406         }
2407
2408
2409         rth = dst_alloc(&ipv4_dst_ops);
2410         if (!rth) {
2411                 err = -ENOBUFS;
2412                 goto cleanup;
2413         }
2414
2415         atomic_set(&rth->dst.__refcnt, 1);
2416         rth->dst.flags= DST_HOST;
2417         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2418                 rth->dst.flags |= DST_NOXFRM;
2419         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2420                 rth->dst.flags |= DST_NOPOLICY;
2421
             /* The flow key comes from the original request (oldflp),
              * the rt_* addresses from the resolved flow (fl). */
2422         rth->fl.fl4_dst = oldflp->fl4_dst;
2423         rth->fl.fl4_tos = tos;
2424         rth->fl.fl4_src = oldflp->fl4_src;
2425         rth->fl.oif     = oldflp->oif;
2426         rth->fl.mark    = oldflp->mark;
2427         rth->rt_dst     = fl->fl4_dst;
2428         rth->rt_src     = fl->fl4_src;
2429         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2430         /* get references to the devices that are to be held by the routing
2431            cache entry */
2432         rth->dst.dev    = dev_out;
2433         dev_hold(dev_out);
2434         rth->idev       = in_dev_get(dev_out);
2435         rth->rt_gateway = fl->fl4_dst;
2436         rth->rt_spec_dst= fl->fl4_src;
2437
2438         rth->dst.output=ip_output;
2439         rth->dst.obsolete = -1;
2440         rth->rt_genid = rt_genid(dev_net(dev_out));
2441
2442         RT_CACHE_STAT_INC(out_slow_tot);
2443
2444         if (flags & RTCF_LOCAL) {
2445                 rth->dst.input = ip_local_deliver;
2446                 rth->rt_spec_dst = fl->fl4_dst;
2447         }
2448         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Overrides the spec_dst chosen in the RTCF_LOCAL case above. */
2449                 rth->rt_spec_dst = fl->fl4_src;
2450                 if (flags & RTCF_LOCAL &&
2451                     !(dev_out->flags & IFF_LOOPBACK)) {
2452                         rth->dst.output = ip_mc_output;
2453                         RT_CACHE_STAT_INC(out_slow_mc);
2454                 }
2455 #ifdef CONFIG_IP_MROUTE
2456                 if (res->type == RTN_MULTICAST) {
2457                         if (IN_DEV_MFORWARD(in_dev) &&
2458                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2459                                 rth->dst.input = ip_mr_input;
2460                                 rth->dst.output = ip_mc_output;
2461                         }
2462                 }
2463 #endif
2464         }
2465
2466         rt_set_nexthop(rth, res, 0);
2467
2468         rth->rt_flags = flags;
2469
2470         *result = rth;
2471  cleanup:
2472         /* release work reference to inet device */
2473         in_dev_put(in_dev);
2474
2475         return err;
2476 }
2477
2478 static int ip_mkroute_output(struct rtable **rp,
2479                              struct fib_result *res,
2480                              const struct flowi *fl,
2481                              const struct flowi *oldflp,
2482                              struct net_device *dev_out,
2483                              unsigned flags)
2484 {
2485         struct rtable *rth = NULL;
2486         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2487         unsigned hash;
2488         if (err == 0) {
2489                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2490                                rt_genid(dev_net(dev_out)));
2491                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2492         }
2493
2494         return err;
2495 }
2496
2497 /*
2498  * Major route resolver routine.
2499  */
2500
/*
 * ip_route_output_slow - resolve an output route by consulting the FIB.
 * @net:    namespace whose loopback device is used and which is passed to
 *          the lookup helpers
 * @rp:     handed unchanged to ip_mkroute_output()
 * @oldflp: the caller's flow key; it is only read, all adjustments are made
 *          on the local working copy "fl"
 *
 * Decision order:
 *   1. A given source address must not be multicast, limited broadcast or
 *      zeronet.  With no oif and a multicast / 255.255.255.255 destination
 *      the device found by ip_dev_find() for the source is used directly.
 *      Otherwise, unless FLOWI_FLAG_ANYSRC is set, ip_dev_find() must find
 *      the source address.
 *   2. A given oif must name an existing device that has an in_device.
 *   3. No destination: route to the source (or INADDR_LOOPBACK) over the
 *      loopback device as RTN_LOCAL.
 *   4. fib_lookup(); on failure a route is still built when oif was given,
 *      otherwise -ENETUNREACH is returned.
 *   5. RTN_LOCAL results go out over the loopback device.
 *   6. Everything else: optional multipath / default-route selection, then
 *      the device of the FIB result is used.
 *
 * Any device reference left in dev_out and, when free_res is set, the FIB
 * result are released before returning.
 *
 * Returns the result of ip_mkroute_output(), or -EINVAL, -ENODEV or
 * -ENETUNREACH when resolution fails earlier.
 */
2501 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2502                                 const struct flowi *oldflp)
2503 {
2504         u32 tos = RT_FL_TOS(oldflp);
        /* Working copy of the key; RTO_ONLINK in tos narrows the scope to link. */
2505         struct flowi fl = { .nl_u = { .ip4_u =
2506                                       { .daddr = oldflp->fl4_dst,
2507                                         .saddr = oldflp->fl4_src,
2508                                         .tos = tos & IPTOS_RT_MASK,
2509                                         .scope = ((tos & RTO_ONLINK) ?
2510                                                   RT_SCOPE_LINK :
2511                                                   RT_SCOPE_UNIVERSE),
2512                                       } },
2513                             .mark = oldflp->mark,
2514                             .iif = net->loopback_dev->ifindex,
2515                             .oif = oldflp->oif };
2516         struct fib_result res;
2517         unsigned flags = 0;
2518         struct net_device *dev_out = NULL;
        /* Set once fib_lookup() succeeded, i.e. res must be released. */
2519         int free_res = 0;
2520         int err;
2521
2522
2523         res.fi          = NULL;
2524 #ifdef CONFIG_IP_MULTIPLE_TABLES
2525         res.r           = NULL;
2526 #endif
2527
        /* Step 1: the caller supplied a source address. */
2528         if (oldflp->fl4_src) {
2529                 err = -EINVAL;
2530                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2531                     ipv4_is_lbcast(oldflp->fl4_src) ||
2532                     ipv4_is_zeronet(oldflp->fl4_src))
2533                         goto out;
2534
2535                 /* I removed check for oif == dev_out->oif here.
2536                    It was wrong for two reasons:
2537                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2538                       is assigned to multiple interfaces.
2539                    2. Moreover, we are allowed to send packets with saddr
2540                       of another iface. --ANK
2541                  */
2542
2543                 if (oldflp->oif == 0 &&
2544                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2545                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2546                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2547                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2548                         if (dev_out == NULL)
2549                                 goto out;
2550
2551                         /* Special hack: user can direct multicasts
2552                            and limited broadcast via necessary interface
2553                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2554                            This hack is not just for fun, it allows
2555                            vic,vat and friends to work.
2556                            They bind socket to loopback, set ttl to zero
2557                            and expect that it will work.
2558                            From the viewpoint of routing cache they are broken,
2559                            because we are not allowed to build multicast path
2560                            with loopback source addr (look, routing cache
2561                            cannot know, that ttl is zero, so that packet
2562                            will not leave this host and route is valid).
2563                            Luckily, this hack is good workaround.
2564                          */
2565
2566                         fl.oif = dev_out->ifindex;
2567                         goto make_route;
2568                 }
2569
2570                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2571                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2572                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2573                         if (dev_out == NULL)
2574                                 goto out;
                        /* Only the existence of the address mattered; drop the device again. */
2575                         dev_put(dev_out);
2576                         dev_out = NULL;
2577                 }
2578         }
2579
2580
        /* Step 2: the caller pinned an output interface. */
2581         if (oldflp->oif) {
2582                 dev_out = dev_get_by_index(net, oldflp->oif);
2583                 err = -ENODEV;
2584                 if (dev_out == NULL)
2585                         goto out;
2586
2587                 /* RACE: Check return value of inet_select_addr instead. */
2588                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2589                         dev_put(dev_out);
2590                         goto out;       /* Wrong error code */
2591                 }
2592
2593                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2594                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2595                         if (!fl.fl4_src)
2596                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2597                                                               RT_SCOPE_LINK);
2598                         goto make_route;
2599                 }
2600                 if (!fl.fl4_src) {
2601                         if (ipv4_is_multicast(oldflp->fl4_dst))
2602                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2603                                                               fl.fl4_scope);
2604                         else if (!oldflp->fl4_dst)
2605                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2606                                                               RT_SCOPE_HOST);
2607                 }
2608         }
2609
        /* Step 3: no destination - deliver locally over the loopback device. */
2610         if (!fl.fl4_dst) {
2611                 fl.fl4_dst = fl.fl4_src;
2612                 if (!fl.fl4_dst)
2613                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2614                 if (dev_out)
2615                         dev_put(dev_out);
2616                 dev_out = net->loopback_dev;
2617                 dev_hold(dev_out);
2618                 fl.oif = net->loopback_dev->ifindex;
2619                 res.type = RTN_LOCAL;
2620                 flags |= RTCF_LOCAL;
2621                 goto make_route;
2622         }
2623
        /* Step 4: FIB lookup; a non-zero return is treated as "no route found". */
2624         if (fib_lookup(net, &fl, &res)) {
2625                 res.fi = NULL;
2626                 if (oldflp->oif) {
2627                         /* Apparently, routing tables are wrong. Assume,
2628                            that the destination is on link.
2629
2630                            WHY? DW.
2631                            Because we are allowed to send to iface
2632                            even if it has NO routes and NO assigned
2633                            addresses. When oif is specified, routing
2634                            tables are looked up with only one purpose:
2635                            to catch if destination is gatewayed, rather than
2636                            direct. Moreover, if MSG_DONTROUTE is set,
2637                            we send packet, ignoring both routing tables
2638                            and ifaddr state. --ANK
2639
2640
2641                            We could make it even if oif is unknown,
2642                            likely IPv6, but we do not.
2643                          */
2644
2645                         if (fl.fl4_src == 0)
2646                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2647                                                               RT_SCOPE_LINK);
2648                         res.type = RTN_UNICAST;
2649                         goto make_route;
2650                 }
2651                 if (dev_out)
2652                         dev_put(dev_out);
2653                 err = -ENETUNREACH;
2654                 goto out;
2655         }
2656         free_res = 1;
2657
        /* Step 5: destination is a local address - use the loopback device. */
2658         if (res.type == RTN_LOCAL) {
2659                 if (!fl.fl4_src)
2660                         fl.fl4_src = fl.fl4_dst;
2661                 if (dev_out)
2662                         dev_put(dev_out);
2663                 dev_out = net->loopback_dev;
2664                 dev_hold(dev_out);
2665                 fl.oif = dev_out->ifindex;
2666                 if (res.fi)
2667                         fib_info_put(res.fi);
2668                 res.fi = NULL;
2669                 flags |= RTCF_LOCAL;
2670                 goto make_route;
2671         }
2672
        /*
         * Step 6: with no oif requested, pick among multiple nexthops
         * (multipath builds) or, for a zero-length-prefix unicast result,
         * among default routes.
         */
2673 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2674         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2675                 fib_select_multipath(&fl, &res);
2676         else
2677 #endif
2678         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2679                 fib_select_default(net, &fl, &res);
2680
2681         if (!fl.fl4_src)
2682                 fl.fl4_src = FIB_RES_PREFSRC(res);
2683
        /* Replace any device held so far with the device of the FIB result. */
2684         if (dev_out)
2685                 dev_put(dev_out);
2686         dev_out = FIB_RES_DEV(res);
2687         dev_hold(dev_out);
2688         fl.oif = dev_out->ifindex;
2689
2690
        /* Common exit for every path that selected a device. */
2691 make_route:
2692         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2693
2694
2695         if (free_res)
2696                 fib_res_put(&res);
2697         if (dev_out)
2698                 dev_put(dev_out);
2699 out:    return err;
2700 }
2701
2702 int __ip_route_output_key(struct net *net, struct rtable **rp,
2703                           const struct flowi *flp)
2704 {
2705         unsigned hash;
2706         struct rtable *rth;
2707
2708         if (!rt_caching(net))
2709                 goto slow_output;
2710
2711         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2712
2713         rcu_read_lock_bh();
2714         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2715                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2716                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2717                     rth->fl.fl4_src == flp->fl4_src &&
2718                     rth->fl.iif == 0 &&
2719                     rth->fl.oif == flp->oif &&
2720                     rth->fl.mark == flp->mark &&
2721                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2722                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2723                     net_eq(dev_net(rth->dst.dev), net) &&
2724                     !rt_is_expired(rth)) {
2725                         dst_use(&rth->dst, jiffies);
2726                         RT_CACHE_STAT_INC(out_hit);
2727                         rcu_read_unlock_bh();
2728                         *rp = rth;
2729                         return 0;
2730                 }
2731                 RT_CACHE_STAT_INC(out_hlist_search);
2732         }
2733         rcu_read_unlock_bh();
2734
2735 slow_output:
2736         return ip_route_output_slow(net, rp, flp);
2737 }
2738
2739 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740
2741 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2742 {
2743 }
2744
/*
 * dst_ops used for the entries allocated by ipv4_dst_blackhole().
 * It reuses the regular IPv4 destroy/check handlers but installs a
 * PMTU handler that does nothing.
 */
2745 static struct dst_ops ipv4_dst_blackhole_ops = {
2746         .family                 =       AF_INET,
2747         .protocol               =       cpu_to_be16(ETH_P_IP),
2748         .destroy                =       ipv4_dst_destroy,
2749         .check                  =       ipv4_dst_check,
2750         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2751         .entries                =       ATOMIC_INIT(0),
2752 };
2753
2754
2755 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2756 {
2757         struct rtable *ort = *rp;
2758         struct rtable *rt = (struct rtable *)
2759                 dst_alloc(&ipv4_dst_blackhole_ops);
2760
2761         if (rt) {
2762                 struct dst_entry *new = &rt->dst;
2763
2764                 atomic_set(&new->__refcnt, 1);
2765                 new->__use = 1;
2766                 new->input = dst_discard;
2767                 new->output = dst_discard;
2768                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2769
2770                 new->dev = ort->dst.dev;
2771                 if (new->dev)
2772                         dev_hold(new->dev);
2773
2774                 rt->fl = ort->fl;
2775
2776                 rt->idev = ort->idev;
2777                 if (rt->idev)
2778                         in_dev_hold(rt->idev);
2779                 rt->rt_genid = rt_genid(net);
2780                 rt->rt_flags = ort->rt_flags;
2781                 rt->rt_type = ort->rt_type;
2782                 rt->rt_dst = ort->rt_dst;
2783                 rt->rt_src = ort->rt_src;
2784                 rt->rt_iif = ort->rt_iif;
2785                 rt->rt_gateway = ort->rt_gateway;
2786                 rt->rt_spec_dst = ort->rt_spec_dst;
2787                 rt->peer = ort->peer;
2788                 if (rt->peer)
2789                         atomic_inc(&rt->peer->refcnt);
2790
2791                 dst_free(new);
2792         }
2793
2794         dst_release(&(*rp)->dst);
2795         *rp = rt;
2796         return (rt ? 0 : -ENOMEM);
2797 }
2798
2799 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2800                          struct sock *sk, int flags)
2801 {
2802         int err;
2803
2804         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2805                 return err;
2806
2807         if (flp->proto) {
2808                 if (!flp->fl4_src)
2809                         flp->fl4_src = (*rp)->rt_src;
2810                 if (!flp->fl4_dst)
2811                         flp->fl4_dst = (*rp)->rt_dst;
2812                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2813                                     flags ? XFRM_LOOKUP_WAIT : 0);
2814                 if (err == -EREMOTE)
2815                         err = ipv4_dst_blackhole(net, rp, flp);
2816
2817                 return err;
2818         }
2819
2820         return 0;
2821 }
2822
2823 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2824
2825 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2826 {
2827         return ip_route_output_flow(net, rp, flp, NULL, 0);
2828 }
2829
/*
 * rt_fill_info - write an rtnetlink route message for skb's route into skb.
 * @net:    namespace, consulted for the MC_FORWARDING setting and passed
 *          to ipmr_get_route()
 * @skb:    supplies the route (skb_rtable()) and receives the message
 * @pid:    passed to nlmsg_put()
 * @seq:    passed to nlmsg_put()
 * @event:  message type passed to nlmsg_put()
 * @nowait: passed to ipmr_get_route(); also decides how its failure is
 *          reported (see below)
 * @flags:  netlink message flags passed to nlmsg_put()
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 and @nowait is not set, and -EMSGSIZE when the message could
 * not be built (the partial message is cancelled on the nla_put_failure
 * path).
 *
 * NOTE(review): the NLA_PUT*() macros are defined elsewhere; they are
 * presumed to jump to nla_put_failure when the attribute does not fit.
 */
2830 static int rt_fill_info(struct net *net,
2831                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2832                         int nowait, unsigned int flags)
2833 {
2834         struct rtable *rt = skb_rtable(skb);
2835         struct rtmsg *r;
2836         struct nlmsghdr *nlh;
2837         long expires;
2838         u32 id = 0, ts = 0, tsage = 0, error;
2839
2840         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2841         if (nlh == NULL)
2842                 return -EMSGSIZE;
2843
        /* Fixed rtmsg header: always a /32 destination in the main table. */
2844         r = nlmsg_data(nlh);
2845         r->rtm_family    = AF_INET;
2846         r->rtm_dst_len  = 32;
2847         r->rtm_src_len  = 0;
2848         r->rtm_tos      = rt->fl.fl4_tos;
2849         r->rtm_table    = RT_TABLE_MAIN;
2850         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2851         r->rtm_type     = rt->rt_type;
2852         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2853         r->rtm_protocol = RTPROT_UNSPEC;
        /* The low 16 bits of rt_flags are masked out of the exported flags. */
2854         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2855         if (rt->rt_flags & RTCF_NOTIFY)
2856                 r->rtm_flags |= RTM_F_NOTIFY;
2857
2858         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2859
2860         if (rt->fl.fl4_src) {
2861                 r->rtm_src_len = 32;
2862                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2863         }
2864         if (rt->dst.dev)
2865                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2866 #ifdef CONFIG_NET_CLS_ROUTE
2867         if (rt->dst.tclassid)
2868                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2869 #endif
        /*
         * Entries with an input interface report rt_spec_dst as preferred
         * source; the others report rt_src, but only when it differs from
         * the source in the flow key.
         */
2870         if (rt->fl.iif)
2871                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2872         else if (rt->rt_src != rt->fl.fl4_src)
2873                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2874
2875         if (rt->rt_dst != rt->rt_gateway)
2876                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2877
2878         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2879                 goto nla_put_failure;
2880
        /* Values for the cacheinfo attribute; expires is relative to jiffies, 0 if unset. */
2881         error = rt->dst.error;
2882         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2883         if (rt->peer) {
2884                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2885                 if (rt->peer->tcp_ts_stamp) {
2886                         ts = rt->peer->tcp_ts;
2887                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2888                 }
2889         }
2890
2891         if (rt->fl.iif) {
2892 #ifdef CONFIG_IP_MROUTE
2893                 __be32 dst = rt->rt_dst;
2894
                /*
                 * Non-link-local multicast destination with multicast
                 * forwarding enabled: ipmr_get_route() is called instead of
                 * emitting RTA_IIF.  With !nowait a result <= 0 ends the
                 * function; with nowait only -EMSGSIZE aborts and any other
                 * result <= 0 is reported through the cacheinfo error field.
                 */
2895                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2896                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2897                         int err = ipmr_get_route(net, skb, r, nowait);
2898                         if (err <= 0) {
2899                                 if (!nowait) {
2900                                         if (err == 0)
2901                                                 return 0;
2902                                         goto nla_put_failure;
2903                                 } else {
2904                                         if (err == -EMSGSIZE)
2905                                                 goto nla_put_failure;
2906                                         error = err;
2907                                 }
2908                         }
2909                 } else
2910 #endif
2911                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2912         }
2913
2914         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2915                                expires, error) < 0)
2916                 goto nla_put_failure;
2917
2918         return nlmsg_end(skb, nlh);
2919
2920 nla_put_failure:
2921         nlmsg_cancel(skb, nlh);
2922         return -EMSGSIZE;
2923 }
2924
/*
 * inet_rtm_getroute - answer a netlink request for a single route.
 * @in_skb: request skb; identifies the namespace and the requester's pid
 * @nlh:    request header, parsed against rtm_ipv4_policy
 * @arg:    unused
 *
 * A reply skb is allocated and doubles as the packet handed to the
 * routing code.  With RTA_IIF present the route is resolved as an input
 * route through ip_route_input() on that device, otherwise as an output
 * route through ip_route_output_key() using RTA_DST, RTA_SRC, RTA_OIF and
 * the request's TOS.  The resolved route is described with rt_fill_info()
 * as RTM_NEWROUTE and unicast back to the requester.
 *
 * Returns the result of rtnl_unicast() on success; otherwise the reply skb
 * is freed (when it was allocated) and the error of the failing step is
 * returned.  A rt_fill_info() result of 0 also frees the skb and returns 0.
 */
2925 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2926 {
2927         struct net *net = sock_net(in_skb->sk);
2928         struct rtmsg *rtm;
2929         struct nlattr *tb[RTA_MAX+1];
2930         struct rtable *rt = NULL;
2931         __be32 dst = 0;
2932         __be32 src = 0;
2933         u32 iif;
2934         int err;
2935         struct sk_buff *skb;
2936
2937         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2938         if (err < 0)
2939                 goto errout;
2940
2941         rtm = nlmsg_data(nlh);
2942
2943         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2944         if (skb == NULL) {
2945                 err = -ENOBUFS;
2946                 goto errout;
2947         }
2948
2949         /* Reserve room for dummy headers, this skb can pass
2950            through good chunk of routing engine.
2951          */
2952         skb_reset_mac_header(skb);
2953         skb_reset_network_header(skb);
2954
2955         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2956         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2957         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2958
        /* Absent attributes are treated as 0. */
2959         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2960         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2961         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2962
2963         if (iif) {
2964                 struct net_device *dev;
2965
                /* NOTE(review): no reference is taken on dev here; presumably the caller holds RTNL - confirm. */
2966                 dev = __dev_get_by_index(net, iif);
2967                 if (dev == NULL) {
2968                         err = -ENODEV;
2969                         goto errout_free;
2970                 }
2971
2972                 skb->protocol   = htons(ETH_P_IP);
2973                 skb->dev        = dev;
2974                 local_bh_disable();
2975                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2976                 local_bh_enable();
2977
                /* A route carrying an error is reported as a failure of the lookup. */
2978                 rt = skb_rtable(skb);
2979                 if (err == 0 && rt->dst.error)
2980                         err = -rt->dst.error;
2981         } else {
2982                 struct flowi fl = {
2983                         .nl_u = {
2984                                 .ip4_u = {
2985                                         .daddr = dst,
2986                                         .saddr = src,
2987                                         .tos = rtm->rtm_tos,
2988                                 },
2989                         },
2990                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2991                 };
2992                 err = ip_route_output_key(net, &rt, &fl);
2993         }
2994
2995         if (err)
2996                 goto errout_free;
2997
        /* Attach the route to the skb so rt_fill_info() can find it via skb_rtable(). */
2998         skb_dst_set(skb, &rt->dst);
2999         if (rtm->rtm_flags & RTM_F_NOTIFY)
3000                 rt->rt_flags |= RTCF_NOTIFY;
3001
3002         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3003                            RTM_NEWROUTE, 0, 0);
3004         if (err <= 0)
3005                 goto errout_free;
3006
3007         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3008 errout:
3009         return err;
3010
3011 errout_free:
3012         kfree_skb(skb);
3013         goto errout;
3014 }
3015
/*
 * ip_rt_dump - netlink dump callback emitting the cached routes of a namespace.
 * @skb: dump skb being filled; its socket selects the namespace
 * @cb:  dump state; args[0] holds the hash bucket and args[1] the index
 *       within that bucket's chain at which to resume
 *
 * Walks the route hash table from the saved position.  Entries of other
 * namespaces, entries before the resume index and expired entries are
 * skipped.  The walk stops as soon as rt_fill_info() returns <= 0 and the
 * position reached is stored back in @cb.
 *
 * Returns skb->len.
 */
3016 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3017 {
3018         struct rtable *rt;
3019         int h, s_h;
3020         int idx, s_idx;
3021         struct net *net;
3022
3023         net = sock_net(skb->sk);
3024
3025         s_h = cb->args[0];
3026         if (s_h < 0)
3027                 s_h = 0;
3028         s_idx = idx = cb->args[1];
        /* s_idx only applies to the first bucket visited; it is reset to 0 afterwards. */
3029         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3030                 if (!rt_hash_table[h].chain)
3031                         continue;
3032                 rcu_read_lock_bh();
3033                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3034                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3035                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3036                                 continue;
3037                         if (rt_is_expired(rt))
3038                                 continue;
                        /* Attach the entry to skb for rt_fill_info(); detached again right after. */
3039                         skb_dst_set_noref(skb, &rt->dst);
3040                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3041                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3042                                          1, NLM_F_MULTI) <= 0) {
3043                                 skb_dst_drop(skb);
3044                                 rcu_read_unlock_bh();
3045                                 goto done;
3046                         }
3047                         skb_dst_drop(skb);
3048                 }
3049                 rcu_read_unlock_bh();
3050         }
3051
        /* Remember the position so that the next invocation resumes here. */
3052 done:
3053         cb->args[0] = h;
3054         cb->args[1] = idx;
3055         return skb->len;
3056 }
3057
3058 void ip_rt_multicast_event(struct in_device *in_dev)
3059 {
3060         rt_cache_flush(dev_net(in_dev->dev), 0);
3061 }
3062
3063 #ifdef CONFIG_SYSCTL
3064 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3065                                         void __user *buffer,
3066                                         size_t *lenp, loff_t *ppos)
3067 {
3068         if (write) {
3069                 int flush_delay;
3070                 ctl_table ctl;
3071                 struct net *net;
3072
3073                 memcpy(&ctl, __ctl, sizeof(ctl));
3074                 ctl.data = &flush_delay;
3075                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3076
3077                 net = (struct net *)__ctl->extra1;
3078                 rt_cache_flush(net, flush_delay);
3079                 return 0;
3080         }
3081
3082         return -EINVAL;
3083 }
3084
/*
 * Integer tunables of the routing code, all readable and writable (0644).
 * This table is installed as the child of the "route" entry in
 * ipv4_skeleton.  Each entry exposes one file-scope variable; the
 * proc_handler chosen per entry decides how the stored value is converted.
 * NOTE(review): the *_jiffies handlers presumably convert between
 * user-visible time units and jiffies - confirm in the sysctl helpers.
 */
3085 static ctl_table ipv4_route_table[] = {
3086         {
3087                 .procname       = "gc_thresh",
3088                 .data           = &ipv4_dst_ops.gc_thresh,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = proc_dointvec,
3092         },
3093         {
3094                 .procname       = "max_size",
3095                 .data           = &ip_rt_max_size,
3096                 .maxlen         = sizeof(int),
3097                 .mode           = 0644,
3098                 .proc_handler   = proc_dointvec,
3099         },
3100         {
3101                 /*  Deprecated. Use gc_min_interval_ms */
3102
3103                 .procname       = "gc_min_interval",
3104                 .data           = &ip_rt_gc_min_interval,
3105                 .maxlen         = sizeof(int),
3106                 .mode           = 0644,
3107                 .proc_handler   = proc_dointvec_jiffies,
3108         },
3109         {
                /* Same variable as "gc_min_interval", through a different handler. */
3110                 .procname       = "gc_min_interval_ms",
3111                 .data           = &ip_rt_gc_min_interval,
3112                 .maxlen         = sizeof(int),
3113                 .mode           = 0644,
3114                 .proc_handler   = proc_dointvec_ms_jiffies,
3115         },
3116         {
3117                 .procname       = "gc_timeout",
3118                 .data           = &ip_rt_gc_timeout,
3119                 .maxlen         = sizeof(int),
3120                 .mode           = 0644,
3121                 .proc_handler   = proc_dointvec_jiffies,
3122         },
3123         {
3124                 .procname       = "gc_interval",
3125                 .data           = &ip_rt_gc_interval,
3126                 .maxlen         = sizeof(int),
3127                 .mode           = 0644,
3128                 .proc_handler   = proc_dointvec_jiffies,
3129         },
3130         {
3131                 .procname       = "redirect_load",
3132                 .data           = &ip_rt_redirect_load,
3133                 .maxlen         = sizeof(int),
3134                 .mode           = 0644,
3135                 .proc_handler   = proc_dointvec,
3136         },
3137         {
3138                 .procname       = "redirect_number",
3139                 .data           = &ip_rt_redirect_number,
3140                 .maxlen         = sizeof(int),
3141                 .mode           = 0644,
3142                 .proc_handler   = proc_dointvec,
3143         },
3144         {
3145                 .procname       = "redirect_silence",
3146                 .data           = &ip_rt_redirect_silence,
3147                 .maxlen         = sizeof(int),
3148                 .mode           = 0644,
3149                 .proc_handler   = proc_dointvec,
3150         },
3151         {
3152                 .procname       = "error_cost",
3153                 .data           = &ip_rt_error_cost,
3154                 .maxlen         = sizeof(int),
3155                 .mode           = 0644,
3156                 .proc_handler   = proc_dointvec,
3157         },
3158         {
3159                 .procname       = "error_burst",
3160                 .data           = &ip_rt_error_burst,
3161                 .maxlen         = sizeof(int),
3162                 .mode           = 0644,
3163                 .proc_handler   = proc_dointvec,
3164         },
3165         {
3166                 .procname       = "gc_elasticity",
3167                 .data           = &ip_rt_gc_elasticity,
3168                 .maxlen         = sizeof(int),
3169                 .mode           = 0644,
3170                 .proc_handler   = proc_dointvec,
3171         },
3172         {
3173                 .procname       = "mtu_expires",
3174                 .data           = &ip_rt_mtu_expires,
3175                 .maxlen         = sizeof(int),
3176                 .mode           = 0644,
3177                 .proc_handler   = proc_dointvec_jiffies,
3178         },
3179         {
3180                 .procname       = "min_pmtu",
3181                 .data           = &ip_rt_min_pmtu,
3182                 .maxlen         = sizeof(int),
3183                 .mode           = 0644,
3184                 .proc_handler   = proc_dointvec,
3185         },
3186         {
3187                 .procname       = "min_adv_mss",
3188                 .data           = &ip_rt_min_advmss,
3189                 .maxlen         = sizeof(int),
3190                 .mode           = 0644,
3191                 .proc_handler   = proc_dointvec,
3192         },
        /* Empty entry terminates the table. */
3193         { }
3194 };
3195
/* Zero-filled, one-element table used as the (empty) child of "neigh" below. */
3196 static struct ctl_table empty[1];
3197
/*
 * Directory skeleton: a "route" directory populated by ipv4_route_table
 * and an empty "neigh" directory, both mode 0555.
 */
3198 static struct ctl_table ipv4_skeleton[] =
3199 {
3200         { .procname = "route", 
3201           .mode = 0555, .child = ipv4_route_table},
3202         { .procname = "neigh", 
3203           .mode = 0555, .child = empty},
3204         { }
3205 };
3206
/* Path components "net", "ipv4". */
3207 static __net_initdata struct ctl_path ipv4_path[] = {
3208         { .procname = "net", },
3209         { .procname = "ipv4", },
3210         { },
3211 };
3212
/*
 * Template for the per-namespace, write-only (0200) "flush" entry.
 * sysctl_route_net_init() stores the owning namespace in ->extra1 and
 * duplicates the table for every namespace other than init_net.
 */
3213 static struct ctl_table ipv4_route_flush_table[] = {
3214         {
3215                 .procname       = "flush",
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0200,
3218                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3219         },
3220         { },
3221 };
3222
/* Path components "net", "ipv4", "route" under which the flush table is registered. */
3223 static __net_initdata struct ctl_path ipv4_route_path[] = {
3224         { .procname = "net", },
3225         { .procname = "ipv4", },
3226         { .procname = "route", },
3227         { },
3228 };
3229
3230 static __net_init int sysctl_route_net_init(struct net *net)
3231 {
3232         struct ctl_table *tbl;
3233
3234         tbl = ipv4_route_flush_table;
3235         if (!net_eq(net, &init_net)) {
3236                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3237                 if (tbl == NULL)
3238                         goto err_dup;
3239         }
3240         tbl[0].extra1 = net;
3241
3242         net->ipv4.route_hdr =
3243                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3244         if (net->ipv4.route_hdr == NULL)
3245                 goto err_reg;
3246         return 0;
3247
3248 err_reg:
3249         if (tbl != ipv4_route_flush_table)
3250                 kfree(tbl);
3251 err_dup:
3252         return -ENOMEM;
3253 }
3254
3255 static __net_exit void sysctl_route_net_exit(struct net *net)
3256 {
3257         struct ctl_table *tbl;
3258
3259         tbl = net->ipv4.route_hdr->ctl_table_arg;
3260         unregister_net_sysctl_table(net->ipv4.route_hdr);
3261         BUG_ON(tbl == ipv4_route_flush_table);
3262         kfree(tbl);
3263 }
3264
3265 static __net_initdata struct pernet_operations sysctl_route_ops = {
3266         .init = sysctl_route_net_init,
3267         .exit = sysctl_route_net_exit,
3268 };
3269 #endif
3270
3271 static __net_init int rt_genid_init(struct net *net)
3272 {
3273         get_random_bytes(&net->ipv4.rt_genid,
3274                          sizeof(net->ipv4.rt_genid));
3275         return 0;
3276 }
3277
3278 static __net_initdata struct pernet_operations rt_genid_ops = {
3279         .init = rt_genid_init,
3280 };
3281
3282
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-CPU accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3286
3287 static __initdata unsigned long rhash_entries;
3288 static int __init set_rhash_entries(char *str)
3289 {
3290         if (!str)
3291                 return 0;
3292         rhash_entries = simple_strtoul(str, &str, 0);
3293         return 1;
3294 }
3295 __setup("rhash_entries=", set_rhash_entries);
3296
3297 int __init ip_rt_init(void)
3298 {
3299         int rc = 0;
3300
3301 #ifdef CONFIG_NET_CLS_ROUTE
3302         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3303         if (!ip_rt_acct)
3304                 panic("IP: failed to allocate ip_rt_acct\n");
3305 #endif
3306
3307         ipv4_dst_ops.kmem_cachep =
3308                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3309                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3310
3311         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3312
3313         rt_hash_table = (struct rt_hash_bucket *)
3314                 alloc_large_system_hash("IP route cache",
3315                                         sizeof(struct rt_hash_bucket),
3316                                         rhash_entries,
3317                                         (totalram_pages >= 128 * 1024) ?
3318                                         15 : 17,
3319                                         0,
3320                                         &rt_hash_log,
3321                                         &rt_hash_mask,
3322                                         rhash_entries ? 0 : 512 * 1024);
3323         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3324         rt_hash_lock_init();
3325
3326         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3327         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3328
3329         devinet_init();
3330         ip_fib_init();
3331
3332         /* All the timers, started at system startup tend
3333            to synchronize. Perturb it a bit.
3334          */
3335         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3336         expires_ljiffies = jiffies;
3337         schedule_delayed_work(&expires_work,
3338                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3339
3340         if (ip_rt_proc_init())
3341                 printk(KERN_ERR "Unable to create route proc files\n");
3342 #ifdef CONFIG_XFRM
3343         xfrm_init();
3344         xfrm4_init(ip_rt_max_size);
3345 #endif
3346         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3347
3348 #ifdef CONFIG_SYSCTL
3349         register_pernet_subsys(&sysctl_route_ops);
3350 #endif
3351         register_pernet_subsys(&rt_genid_ops);
3352         return rc;
3353 }
3354
#ifdef CONFIG_SYSCTL
/*
 * Registers the static net/ipv4 sysctl skeleton (ipv4_path plus
 * ipv4_skeleton).  This separate entry point exists only because of the
 * current ipv4 init ordering; once that ordering is cleaned up it can
 * be removed.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */
3365
3366 EXPORT_SYMBOL(__ip_select_ident);
3367 EXPORT_SYMBOL(ip_route_output_key);