net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      though our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #endif
113 #include <net/secure_seq.h>
114
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define IP_MAX_MTU      0xFFF0
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
125 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
126 static int ip_rt_redirect_number __read_mostly  = 9;
127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost __read_mostly       = HZ;
130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
131 static int ip_rt_gc_elasticity __read_mostly    = 8;
132 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
134 static int ip_rt_min_advmss __read_mostly       = 256;
135 static int rt_chain_length_max __read_mostly    = 20;
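/* Putting numbers on the defaults above, assuming HZ = 1000 (a common
 * configuration; the exact values depend on the kernel's HZ setting):
 *	ip_rt_redirect_load     = HZ / 50        = 20 jiffies
 *	ip_rt_redirect_silence  = 20 << 10       = 20480 jiffies (~20 s)
 *	ip_rt_mtu_expires       = 10 * 60 * HZ   = 10 minutes
 *	ip_rt_min_pmtu          = 512 + 20 + 20  = 552 bytes (payload + IP + TCP headers)
 * Most of these are exposed as sysctls under net.ipv4.route.
 */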
136
137 static struct delayed_work expires_work;
138 static unsigned long expires_ljiffies;
139
140 /*
141  *      Interface to generic destination cache.
142  */
143
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
147 static void              ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void              ipv4_link_failure(struct sk_buff *skb);
150 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
152
153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
154                             int how)
155 {
156 }
157
158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 {
160         struct rtable *rt = (struct rtable *) dst;
161         struct inet_peer *peer;
162         u32 *p = NULL;
163
164         if (!rt->peer)
165                 rt_bind_peer(rt, rt->rt_dst, 1);
166
167         peer = rt->peer;
168         if (peer) {
169                 u32 *old_p = __DST_METRICS_PTR(old);
170                 unsigned long prev, new;
171
172                 p = peer->metrics;
173                 if (inet_metrics_new(peer))
174                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175
176                 new = (unsigned long) p;
177                 prev = cmpxchg(&dst->_metrics, old, new);
178
179                 if (prev != old) {
180                         p = __DST_METRICS_PTR(prev);
181                         if (prev & DST_METRICS_READ_ONLY)
182                                 p = NULL;
183                 } else {
184                         if (rt->fi) {
185                                 fib_info_put(rt->fi);
186                                 rt->fi = NULL;
187                         }
188                 }
189         }
190         return p;
191 }
192
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194
195 static struct dst_ops ipv4_dst_ops = {
196         .family =               AF_INET,
197         .protocol =             cpu_to_be16(ETH_P_IP),
198         .gc =                   rt_garbage_collect,
199         .check =                ipv4_dst_check,
200         .default_advmss =       ipv4_default_advmss,
201         .mtu =                  ipv4_mtu,
202         .cow_metrics =          ipv4_cow_metrics,
203         .destroy =              ipv4_dst_destroy,
204         .ifdown =               ipv4_dst_ifdown,
205         .negative_advice =      ipv4_negative_advice,
206         .link_failure =         ipv4_link_failure,
207         .update_pmtu =          ip_rt_update_pmtu,
208         .local_out =            __ip_local_out,
209         .neigh_lookup =         ipv4_neigh_lookup,
210 };
211
212 #define ECN_OR_COST(class)      TC_PRIO_##class
213
214 const __u8 ip_tos2prio[16] = {
215         TC_PRIO_BESTEFFORT,
216         ECN_OR_COST(BESTEFFORT),
217         TC_PRIO_BESTEFFORT,
218         ECN_OR_COST(BESTEFFORT),
219         TC_PRIO_BULK,
220         ECN_OR_COST(BULK),
221         TC_PRIO_BULK,
222         ECN_OR_COST(BULK),
223         TC_PRIO_INTERACTIVE,
224         ECN_OR_COST(INTERACTIVE),
225         TC_PRIO_INTERACTIVE,
226         ECN_OR_COST(INTERACTIVE),
227         TC_PRIO_INTERACTIVE_BULK,
228         ECN_OR_COST(INTERACTIVE_BULK),
229         TC_PRIO_INTERACTIVE_BULK,
230         ECN_OR_COST(INTERACTIVE_BULK)
231 };
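/* A minimal sketch of how the table above is consumed, assuming the helper
 * rt_tos2priority() from <net/route.h>: the four TOS bits index the table,
 * so each even/odd pair covers one TOS class plus its ECN_OR_COST() twin.
 *
 *	u8 tos = ip_hdr(skb)->tos;
 *	char prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 */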
232
233
234 /*
235  * Route cache.
236  */
237
238 /* The locking scheme is rather straightforward:
239  *
240  * 1) Read-Copy Update protects the buckets of the central route hash.
241  * 2) Only writers remove entries, and they hold the lock
242  *    as they look at rtable reference counts.
243  * 3) Only readers acquire references to rtable entries,
244  *    they do so with atomic increments and with the
245  *    lock held.
246  */
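/* A minimal reader-side sketch of rule 3) above; match() is a hypothetical
 * stand-in for the real key comparison, compare_keys(), defined later in
 * this file, and dst_use() bumps the refcount atomically and records lastuse:
 *
 *	rcu_read_lock_bh();
 *	for (r = rcu_dereference_bh(rt_hash_table[hash].chain); r;
 *	     r = rcu_dereference_bh(r->dst.rt_next)) {
 *		if (match(r)) {
 *			dst_use(&r->dst, jiffies);
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 */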
247
248 struct rt_hash_bucket {
249         struct rtable __rcu     *chain;
250 };
251
252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
253         defined(CONFIG_PROVE_LOCKING)
254 /*
255  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
256  * The size of this table is a power of two and depends on the number of CPUs.
257  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
258  */
259 #ifdef CONFIG_LOCKDEP
260 # define RT_HASH_LOCK_SZ        256
261 #else
262 # if NR_CPUS >= 32
263 #  define RT_HASH_LOCK_SZ       4096
264 # elif NR_CPUS >= 16
265 #  define RT_HASH_LOCK_SZ       2048
266 # elif NR_CPUS >= 8
267 #  define RT_HASH_LOCK_SZ       1024
268 # elif NR_CPUS >= 4
269 #  define RT_HASH_LOCK_SZ       512
270 # else
271 #  define RT_HASH_LOCK_SZ       256
272 # endif
273 #endif
274
275 static spinlock_t       *rt_hash_locks;
276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
277
278 static __init void rt_hash_lock_init(void)
279 {
280         int i;
281
282         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
283                         GFP_KERNEL);
284         if (!rt_hash_locks)
285                 panic("IP: failed to allocate rt_hash_locks\n");
286
287         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
288                 spin_lock_init(&rt_hash_locks[i]);
289 }
290 #else
291 # define rt_hash_lock_addr(slot) NULL
292
293 static inline void rt_hash_lock_init(void)
294 {
295 }
296 #endif
297
298 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
299 static unsigned                 rt_hash_mask __read_mostly;
300 static unsigned int             rt_hash_log  __read_mostly;
301
302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
304
305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
306                                    int genid)
307 {
308         return jhash_3words((__force u32)daddr, (__force u32)saddr,
309                             idx, genid)
310                 & rt_hash_mask;
311 }
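/* Typical call site (as in ip_rt_redirect() and rt_intern_hash() below).
 * The generation id is both folded into the hash and stored in each entry,
 * so bumping rt_genid lets rt_is_expired() treat every older entry as stale
 * without walking the table:
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(net));
 *	rthp = &rt_hash_table[hash].chain;
 */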
312
313 static inline int rt_genid(struct net *net)
314 {
315         return atomic_read(&net->ipv4.rt_genid);
316 }
317
318 #ifdef CONFIG_PROC_FS
319 struct rt_cache_iter_state {
320         struct seq_net_private p;
321         int bucket;
322         int genid;
323 };
324
325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
326 {
327         struct rt_cache_iter_state *st = seq->private;
328         struct rtable *r = NULL;
329
330         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
331                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
332                         continue;
333                 rcu_read_lock_bh();
334                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
335                 while (r) {
336                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
337                             r->rt_genid == st->genid)
338                                 return r;
339                         r = rcu_dereference_bh(r->dst.rt_next);
340                 }
341                 rcu_read_unlock_bh();
342         }
343         return r;
344 }
345
346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
347                                           struct rtable *r)
348 {
349         struct rt_cache_iter_state *st = seq->private;
350
351         r = rcu_dereference_bh(r->dst.rt_next);
352         while (!r) {
353                 rcu_read_unlock_bh();
354                 do {
355                         if (--st->bucket < 0)
356                                 return NULL;
357                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
358                 rcu_read_lock_bh();
359                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
360         }
361         return r;
362 }
363
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
365                                         struct rtable *r)
366 {
367         struct rt_cache_iter_state *st = seq->private;
368         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369                 if (dev_net(r->dst.dev) != seq_file_net(seq))
370                         continue;
371                 if (r->rt_genid == st->genid)
372                         break;
373         }
374         return r;
375 }
376
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
378 {
379         struct rtable *r = rt_cache_get_first(seq);
380
381         if (r)
382                 while (pos && (r = rt_cache_get_next(seq, r)))
383                         --pos;
384         return pos ? NULL : r;
385 }
386
387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388 {
389         struct rt_cache_iter_state *st = seq->private;
390         if (*pos)
391                 return rt_cache_get_idx(seq, *pos - 1);
392         st->genid = rt_genid(seq_file_net(seq));
393         return SEQ_START_TOKEN;
394 }
395
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397 {
398         struct rtable *r;
399
400         if (v == SEQ_START_TOKEN)
401                 r = rt_cache_get_first(seq);
402         else
403                 r = rt_cache_get_next(seq, v);
404         ++*pos;
405         return r;
406 }
407
408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409 {
410         if (v && v != SEQ_START_TOKEN)
411                 rcu_read_unlock_bh();
412 }
413
414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
415 {
416         if (v == SEQ_START_TOKEN)
417                 seq_printf(seq, "%-127s\n",
418                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420                            "HHUptod\tSpecDst");
421         else {
422                 struct rtable *r = v;
423                 struct neighbour *n;
424                 int len, HHUptod;
425
426                 rcu_read_lock();
427                 n = dst_get_neighbour_noref(&r->dst);
428                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429                 rcu_read_unlock();
430
431                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
433                         r->dst.dev ? r->dst.dev->name : "*",
434                         (__force u32)r->rt_dst,
435                         (__force u32)r->rt_gateway,
436                         r->rt_flags, atomic_read(&r->dst.__refcnt),
437                         r->dst.__use, 0, (__force u32)r->rt_src,
438                         dst_metric_advmss(&r->dst) + 40,
439                         dst_metric(&r->dst, RTAX_WINDOW),
440                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441                               dst_metric(&r->dst, RTAX_RTTVAR)),
442                         r->rt_key_tos,
443                         -1,
444                         HHUptod,
445                         r->rt_spec_dst, &len);
446
447                 seq_printf(seq, "%*s\n", 127 - len, "");
448         }
449         return 0;
450 }
451
452 static const struct seq_operations rt_cache_seq_ops = {
453         .start  = rt_cache_seq_start,
454         .next   = rt_cache_seq_next,
455         .stop   = rt_cache_seq_stop,
456         .show   = rt_cache_seq_show,
457 };
458
459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
460 {
461         return seq_open_net(inode, file, &rt_cache_seq_ops,
462                         sizeof(struct rt_cache_iter_state));
463 }
464
465 static const struct file_operations rt_cache_seq_fops = {
466         .owner   = THIS_MODULE,
467         .open    = rt_cache_seq_open,
468         .read    = seq_read,
469         .llseek  = seq_lseek,
470         .release = seq_release_net,
471 };
472
473
474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475 {
476         int cpu;
477
478         if (*pos == 0)
479                 return SEQ_START_TOKEN;
480
481         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
482                 if (!cpu_possible(cpu))
483                         continue;
484                 *pos = cpu+1;
485                 return &per_cpu(rt_cache_stat, cpu);
486         }
487         return NULL;
488 }
489
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491 {
492         int cpu;
493
494         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495                 if (!cpu_possible(cpu))
496                         continue;
497                 *pos = cpu+1;
498                 return &per_cpu(rt_cache_stat, cpu);
499         }
500         return NULL;
501
502 }
503
504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 {
506
507 }
508
509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510 {
511         struct rt_cache_stat *st = v;
512
513         if (v == SEQ_START_TOKEN) {
514                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
515                 return 0;
516         }
517
518         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
519                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
520                    dst_entries_get_slow(&ipv4_dst_ops),
521                    st->in_hit,
522                    st->in_slow_tot,
523                    st->in_slow_mc,
524                    st->in_no_route,
525                    st->in_brd,
526                    st->in_martian_dst,
527                    st->in_martian_src,
528
529                    st->out_hit,
530                    st->out_slow_tot,
531                    st->out_slow_mc,
532
533                    st->gc_total,
534                    st->gc_ignored,
535                    st->gc_goal_miss,
536                    st->gc_dst_overflow,
537                    st->in_hlist_search,
538                    st->out_hlist_search
539                 );
540         return 0;
541 }
542
543 static const struct seq_operations rt_cpu_seq_ops = {
544         .start  = rt_cpu_seq_start,
545         .next   = rt_cpu_seq_next,
546         .stop   = rt_cpu_seq_stop,
547         .show   = rt_cpu_seq_show,
548 };
549
550
551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552 {
553         return seq_open(file, &rt_cpu_seq_ops);
554 }
555
556 static const struct file_operations rt_cpu_seq_fops = {
557         .owner   = THIS_MODULE,
558         .open    = rt_cpu_seq_open,
559         .read    = seq_read,
560         .llseek  = seq_lseek,
561         .release = seq_release,
562 };
563
564 #ifdef CONFIG_IP_ROUTE_CLASSID
565 static int rt_acct_proc_show(struct seq_file *m, void *v)
566 {
567         struct ip_rt_acct *dst, *src;
568         unsigned int i, j;
569
570         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571         if (!dst)
572                 return -ENOMEM;
573
574         for_each_possible_cpu(i) {
575                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576                 for (j = 0; j < 256; j++) {
577                         dst[j].o_bytes   += src[j].o_bytes;
578                         dst[j].o_packets += src[j].o_packets;
579                         dst[j].i_bytes   += src[j].i_bytes;
580                         dst[j].i_packets += src[j].i_packets;
581                 }
582         }
583
584         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585         kfree(dst);
586         return 0;
587 }
588
589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
590 {
591         return single_open(file, rt_acct_proc_show, NULL);
592 }
593
594 static const struct file_operations rt_acct_proc_fops = {
595         .owner          = THIS_MODULE,
596         .open           = rt_acct_proc_open,
597         .read           = seq_read,
598         .llseek         = seq_lseek,
599         .release        = single_release,
600 };
601 #endif
602
603 static int __net_init ip_rt_do_proc_init(struct net *net)
604 {
605         struct proc_dir_entry *pde;
606
607         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608                         &rt_cache_seq_fops);
609         if (!pde)
610                 goto err1;
611
612         pde = proc_create("rt_cache", S_IRUGO,
613                           net->proc_net_stat, &rt_cpu_seq_fops);
614         if (!pde)
615                 goto err2;
616
617 #ifdef CONFIG_IP_ROUTE_CLASSID
618         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
619         if (!pde)
620                 goto err3;
621 #endif
622         return 0;
623
624 #ifdef CONFIG_IP_ROUTE_CLASSID
625 err3:
626         remove_proc_entry("rt_cache", net->proc_net_stat);
627 #endif
628 err2:
629         remove_proc_entry("rt_cache", net->proc_net);
630 err1:
631         return -ENOMEM;
632 }
633
634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
635 {
636         remove_proc_entry("rt_cache", net->proc_net_stat);
637         remove_proc_entry("rt_cache", net->proc_net);
638 #ifdef CONFIG_IP_ROUTE_CLASSID
639         remove_proc_entry("rt_acct", net->proc_net);
640 #endif
641 }
642
643 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
644         .init = ip_rt_do_proc_init,
645         .exit = ip_rt_do_proc_exit,
646 };
647
648 static int __init ip_rt_proc_init(void)
649 {
650         return register_pernet_subsys(&ip_rt_proc_ops);
651 }
652
653 #else
654 static inline int ip_rt_proc_init(void)
655 {
656         return 0;
657 }
658 #endif /* CONFIG_PROC_FS */
659
660 static inline void rt_free(struct rtable *rt)
661 {
662         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
663 }
664
665 static inline void rt_drop(struct rtable *rt)
666 {
667         ip_rt_put(rt);
668         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
669 }
670
671 static inline int rt_fast_clean(struct rtable *rth)
672 {
673         /* Kill broadcast/multicast entries very aggressively if they
674            collide in the hash table with more useful entries */
675         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
676                 rt_is_input_route(rth) && rth->dst.rt_next;
677 }
678
679 static inline int rt_valuable(struct rtable *rth)
680 {
681         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
682                 (rth->peer && rth->peer->pmtu_expires);
683 }
684
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686 {
687         unsigned long age;
688         int ret = 0;
689
690         if (atomic_read(&rth->dst.__refcnt))
691                 goto out;
692
693         age = jiffies - rth->dst.lastuse;
694         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695             (age <= tmo2 && rt_valuable(rth)))
696                 goto out;
697         ret = 1;
698 out:    return ret;
699 }
700
701 /* Bits of score are:
702  * 31: very valuable
703  * 30: not quite useless
704  * 29..0: usage counter
705  */
706 static inline u32 rt_score(struct rtable *rt)
707 {
708         u32 score = jiffies - rt->dst.lastuse;
709
710         score = ~score & ~(3<<30);
711
712         if (rt_valuable(rt))
713                 score |= (1<<31);
714
715         if (rt_is_output_route(rt) ||
716             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717                 score |= (1<<30);
718
719         return score;
720 }
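/* Worked example (illustrative numbers): an unreferenced output route whose
 * last use was 16 jiffies ago scores (~16UL & ~(3UL << 30)) | (1UL << 30).
 * Older entries yield a smaller ~age and hence a lower score, and
 * rt_intern_hash() below evicts the lowest-scoring unreferenced entry once
 * a chain grows past ip_rt_gc_elasticity.
 */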
721
722 static inline bool rt_caching(const struct net *net)
723 {
724         return net->ipv4.current_rt_cache_rebuild_count <=
725                 net->ipv4.sysctl_rt_cache_rebuild_count;
726 }
727
728 static inline bool compare_hash_inputs(const struct rtable *rt1,
729                                        const struct rtable *rt2)
730 {
731         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
733                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
734 }
735
736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
737 {
738         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740                 (rt1->rt_mark ^ rt2->rt_mark) |
741                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
742                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
743                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
744 }
745
746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747 {
748         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
749 }
750
751 static inline int rt_is_expired(struct rtable *rth)
752 {
753         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
754 }
755
756 /*
757  * Perform a full scan of the hash table and free all entries.
758  * Can be called from a softirq or a process.
759  * In the latter case, we reschedule if necessary.
760  */
761 static void rt_do_flush(struct net *net, int process_context)
762 {
763         unsigned int i;
764         struct rtable *rth, *next;
765
766         for (i = 0; i <= rt_hash_mask; i++) {
767                 struct rtable __rcu **pprev;
768                 struct rtable *list;
769
770                 if (process_context && need_resched())
771                         cond_resched();
772                 rth = rcu_access_pointer(rt_hash_table[i].chain);
773                 if (!rth)
774                         continue;
775
776                 spin_lock_bh(rt_hash_lock_addr(i));
777
778                 list = NULL;
779                 pprev = &rt_hash_table[i].chain;
780                 rth = rcu_dereference_protected(*pprev,
781                         lockdep_is_held(rt_hash_lock_addr(i)));
782
783                 while (rth) {
784                         next = rcu_dereference_protected(rth->dst.rt_next,
785                                 lockdep_is_held(rt_hash_lock_addr(i)));
786
787                         if (!net ||
788                             net_eq(dev_net(rth->dst.dev), net)) {
789                                 rcu_assign_pointer(*pprev, next);
790                                 rcu_assign_pointer(rth->dst.rt_next, list);
791                                 list = rth;
792                         } else {
793                                 pprev = &rth->dst.rt_next;
794                         }
795                         rth = next;
796                 }
797
798                 spin_unlock_bh(rt_hash_lock_addr(i));
799
800                 for (; list; list = next) {
801                         next = rcu_dereference_protected(list->dst.rt_next, 1);
802                         rt_free(list);
803                 }
804         }
805 }
806
807 /*
808  * While freeing expired entries, we compute average chain length
809  * and standard deviation, using fixed-point arithmetic.
810  * This gives an estimate of rt_chain_length_max:
811  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
812  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
813  */
814
815 #define FRACT_BITS 3
816 #define ONE (1UL << FRACT_BITS)
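/* Worked example of the fixed-point averaging above (illustrative numbers):
 * with FRACT_BITS = 3 each unaliased entry counts as ONE = 8, so a measured
 * avg of 24 and sd of 8 (i.e. 3.0 and 1.0 entries) give
 *	rt_chain_length_max = max(elasticity, (24 + 4*8) >> 3) = max(8, 7) = 8
 * with the default ip_rt_gc_elasticity of 8.
 */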
817
818 /*
819  * Given a hash chain and an item in this hash chain,
820  * find if a previous entry has the same hash_inputs
821  * (but differs on tos, mark or oif)
822  * Returns 0 if an alias is found.
823  * Returns ONE if rth has no alias before itself.
824  */
825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
826 {
827         const struct rtable *aux = head;
828
829         while (aux != rth) {
830                 if (compare_hash_inputs(aux, rth))
831                         return 0;
832                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
833         }
834         return ONE;
835 }
836
837 static void rt_check_expire(void)
838 {
839         static unsigned int rover;
840         unsigned int i = rover, goal;
841         struct rtable *rth;
842         struct rtable __rcu **rthp;
843         unsigned long samples = 0;
844         unsigned long sum = 0, sum2 = 0;
845         unsigned long delta;
846         u64 mult;
847
848         delta = jiffies - expires_ljiffies;
849         expires_ljiffies = jiffies;
850         mult = ((u64)delta) << rt_hash_log;
851         if (ip_rt_gc_timeout > 1)
852                 do_div(mult, ip_rt_gc_timeout);
853         goal = (unsigned int)mult;
854         if (goal > rt_hash_mask)
855                 goal = rt_hash_mask + 1;
856         for (; goal > 0; goal--) {
857                 unsigned long tmo = ip_rt_gc_timeout;
858                 unsigned long length;
859
860                 i = (i + 1) & rt_hash_mask;
861                 rthp = &rt_hash_table[i].chain;
862
863                 if (need_resched())
864                         cond_resched();
865
866                 samples++;
867
868                 if (rcu_dereference_raw(*rthp) == NULL)
869                         continue;
870                 length = 0;
871                 spin_lock_bh(rt_hash_lock_addr(i));
872                 while ((rth = rcu_dereference_protected(*rthp,
873                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874                         prefetch(rth->dst.rt_next);
875                         if (rt_is_expired(rth)) {
876                                 *rthp = rth->dst.rt_next;
877                                 rt_free(rth);
878                                 continue;
879                         }
880                         if (rth->dst.expires) {
881                                 /* Entry is expired even if it is in use */
882                                 if (time_before_eq(jiffies, rth->dst.expires)) {
883 nofree:
884                                         tmo >>= 1;
885                                         rthp = &rth->dst.rt_next;
886                                         /*
887                                          * We only count entries on
888                                          * a chain with equal hash inputs once
889                                          * so that entries for different QOS
890                                          * levels, and other non-hash input
891                                          * attributes don't unfairly skew
892                                          * the length computation
893                                          */
894                                         length += has_noalias(rt_hash_table[i].chain, rth);
895                                         continue;
896                                 }
897                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898                                 goto nofree;
899
900                         /* Cleanup aged off entries. */
901                         *rthp = rth->dst.rt_next;
902                         rt_free(rth);
903                 }
904                 spin_unlock_bh(rt_hash_lock_addr(i));
905                 sum += length;
906                 sum2 += length*length;
907         }
908         if (samples) {
909                 unsigned long avg = sum / samples;
910                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911                 rt_chain_length_max = max_t(unsigned long,
912                                         ip_rt_gc_elasticity,
913                                         (avg + 4*sd) >> FRACT_BITS);
914         }
915         rover = i;
916 }
917
918 /*
919  * rt_worker_func() is run in process context.
920  * We call rt_check_expire() to scan part of the hash table.
921  */
922 static void rt_worker_func(struct work_struct *work)
923 {
924         rt_check_expire();
925         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926 }
927
928 /*
929  * Perturbation of rt_genid by a small quantity [1..256]
930  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
931  * many times (2^24) without reusing a recent rt_genid.
932  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
933  */
934 static void rt_cache_invalidate(struct net *net)
935 {
936         unsigned char shuffle;
937
938         get_random_bytes(&shuffle, sizeof(shuffle));
939         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
940         inetpeer_invalidate_tree(AF_INET);
941 }
942
943 /*
944  * delay < 0  : invalidate cache (fast : entries will be deleted later)
945  * delay >= 0 : invalidate & flush cache (can be long)
946  */
947 void rt_cache_flush(struct net *net, int delay)
948 {
949         rt_cache_invalidate(net);
950         if (delay >= 0)
951                 rt_do_flush(net, !in_softirq());
952 }
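/* Usage sketch of the two flavours described above (callers live outside
 * this file, e.g. in the IPv4 FIB code):
 *
 *	rt_cache_flush(net, -1);	invalidate only: bump the generation id
 *	rt_cache_flush(net,  0);	invalidate and walk/free the hash table
 */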
953
954 /* Flush previous cache invalidated entries from the cache */
955 void rt_cache_flush_batch(struct net *net)
956 {
957         rt_do_flush(net, !in_softirq());
958 }
959
960 static void rt_emergency_hash_rebuild(struct net *net)
961 {
962         if (net_ratelimit())
963                 pr_warn("Route hash chain too long!\n");
964         rt_cache_invalidate(net);
965 }
966
967 /*
968    Short description of GC goals.
969
970    We want an algorithm that keeps the routing cache at an equilibrium
971    point, where the number of aged-off entries stays approximately equal
972    to the number of newly generated ones.
973
974    The current expiration strength is the variable "expire".
975    We try to adjust it dynamically, so that when the network is idle
976    expire is large enough to keep plenty of warm entries, and when load
977    increases it shrinks to limit the cache size.
978  */
979
980 static int rt_garbage_collect(struct dst_ops *ops)
981 {
982         static unsigned long expire = RT_GC_TIMEOUT;
983         static unsigned long last_gc;
984         static int rover;
985         static int equilibrium;
986         struct rtable *rth;
987         struct rtable __rcu **rthp;
988         unsigned long now = jiffies;
989         int goal;
990         int entries = dst_entries_get_fast(&ipv4_dst_ops);
991
992         /*
993          * Garbage collection is pretty expensive,
994          * do not make it too frequently.
995          */
996
997         RT_CACHE_STAT_INC(gc_total);
998
999         if (now - last_gc < ip_rt_gc_min_interval &&
1000             entries < ip_rt_max_size) {
1001                 RT_CACHE_STAT_INC(gc_ignored);
1002                 goto out;
1003         }
1004
1005         entries = dst_entries_get_slow(&ipv4_dst_ops);
1006         /* Calculate number of entries, which we want to expire now. */
1007         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008         if (goal <= 0) {
1009                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010                         equilibrium = ipv4_dst_ops.gc_thresh;
1011                 goal = entries - equilibrium;
1012                 if (goal > 0) {
1013                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014                         goal = entries - equilibrium;
1015                 }
1016         } else {
1017                 /* We are in a dangerous area. Try to reduce the cache really
1018                  * aggressively.
1019                  */
1020                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021                 equilibrium = entries - goal;
1022         }
1023
1024         if (now - last_gc >= ip_rt_gc_min_interval)
1025                 last_gc = now;
1026
1027         if (goal <= 0) {
1028                 equilibrium += goal;
1029                 goto work_done;
1030         }
1031
1032         do {
1033                 int i, k;
1034
1035                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036                         unsigned long tmo = expire;
1037
1038                         k = (k + 1) & rt_hash_mask;
1039                         rthp = &rt_hash_table[k].chain;
1040                         spin_lock_bh(rt_hash_lock_addr(k));
1041                         while ((rth = rcu_dereference_protected(*rthp,
1042                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043                                 if (!rt_is_expired(rth) &&
1044                                         !rt_may_expire(rth, tmo, expire)) {
1045                                         tmo >>= 1;
1046                                         rthp = &rth->dst.rt_next;
1047                                         continue;
1048                                 }
1049                                 *rthp = rth->dst.rt_next;
1050                                 rt_free(rth);
1051                                 goal--;
1052                         }
1053                         spin_unlock_bh(rt_hash_lock_addr(k));
1054                         if (goal <= 0)
1055                                 break;
1056                 }
1057                 rover = k;
1058
1059                 if (goal <= 0)
1060                         goto work_done;
1061
1062                 /* Goal is not achieved. We stop the process if:
1063
1064                    - expire has been reduced to zero (otherwise, expire is halved);
1065                    - the table is not full;
1066                    - we are called from interrupt context;
1067                    - the jiffies check is just a fallback/debug loop breaker.
1068                      We will not spin here for a long time in any case.
1069                  */
1070
1071                 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073                 if (expire == 0)
1074                         break;
1075
1076                 expire >>= 1;
1077
1078                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079                         goto out;
1080         } while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085                 goto out;
1086         if (net_ratelimit())
1087                 pr_warn("dst cache overflow\n");
1088         RT_CACHE_STAT_INC(gc_dst_overflow);
1089         return 1;
1090
1091 work_done:
1092         expire += ip_rt_gc_min_interval;
1093         if (expire > ip_rt_gc_timeout ||
1094             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1096                 expire = ip_rt_gc_timeout;
1097 out:    return 0;
1098 }
1099
1100 /*
1101  * Returns the number of entries in a hash chain that have different hash_inputs
1102  */
1103 static int slow_chain_length(const struct rtable *head)
1104 {
1105         int length = 0;
1106         const struct rtable *rth = head;
1107
1108         while (rth) {
1109                 length += has_noalias(head, rth);
1110                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1111         }
1112         return length >> FRACT_BITS;
1113 }
1114
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1116 {
1117         static const __be32 inaddr_any = 0;
1118         struct net_device *dev = dst->dev;
1119         const __be32 *pkey = daddr;
1120         const struct rtable *rt;
1121         struct neighbour *n;
1122
1123         rt = (const struct rtable *) dst;
1124
1125         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1126                 pkey = &inaddr_any;
1127         else if (rt->rt_gateway)
1128                 pkey = (const __be32 *) &rt->rt_gateway;
1129
1130         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1131         if (n)
1132                 return n;
1133         return neigh_create(&arp_tbl, pkey, dev);
1134 }
1135
1136 static int rt_bind_neighbour(struct rtable *rt)
1137 {
1138         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1139         if (IS_ERR(n))
1140                 return PTR_ERR(n);
1141         dst_set_neighbour(&rt->dst, n);
1142
1143         return 0;
1144 }
1145
1146 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1147                                      struct sk_buff *skb, int ifindex)
1148 {
1149         struct rtable   *rth, *cand;
1150         struct rtable __rcu **rthp, **candp;
1151         unsigned long   now;
1152         u32             min_score;
1153         int             chain_length;
1154         int attempts = !in_softirq();
1155
1156 restart:
1157         chain_length = 0;
1158         min_score = ~(u32)0;
1159         cand = NULL;
1160         candp = NULL;
1161         now = jiffies;
1162
1163         if (!rt_caching(dev_net(rt->dst.dev))) {
1164                 /*
1165                  * If we're not caching, just tell the caller we
1166                  * were successful and don't touch the route.  The
1167                  * caller holds the sole reference to the cache entry, and
1168                  * it will be released when the caller is done with it.
1169                  * If we drop it here, the callers have no way to resolve routes
1170                  * when we're not caching.  Instead, just point *rp at rt, so
1171                  * the caller gets a single use out of the route.
1172                  * Note that we do rt_free on this new route entry, so that
1173                  * once its refcount hits zero, we are still able to reap it
1174                  * (Thanks Alexey)
1175                  * Note: To avoid expensive rcu stuff for this uncached dst,
1176                  * we set DST_NOCACHE so that dst_release() can free dst without
1177                  * waiting for a grace period.
1178                  */
1179
1180                 rt->dst.flags |= DST_NOCACHE;
1181                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1182                         int err = rt_bind_neighbour(rt);
1183                         if (err) {
1184                                 if (net_ratelimit())
1185                                         pr_warn("Neighbour table failure & not caching routes\n");
1186                                 ip_rt_put(rt);
1187                                 return ERR_PTR(err);
1188                         }
1189                 }
1190
1191                 goto skip_hashing;
1192         }
1193
1194         rthp = &rt_hash_table[hash].chain;
1195
1196         spin_lock_bh(rt_hash_lock_addr(hash));
1197         while ((rth = rcu_dereference_protected(*rthp,
1198                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1199                 if (rt_is_expired(rth)) {
1200                         *rthp = rth->dst.rt_next;
1201                         rt_free(rth);
1202                         continue;
1203                 }
1204                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1205                         /* Put it first */
1206                         *rthp = rth->dst.rt_next;
1207                         /*
1208                          * Since lookup is lockfree, the deletion
1209                          * must be visible to another weakly ordered CPU before
1210                          * the insertion at the start of the hash chain.
1211                          */
1212                         rcu_assign_pointer(rth->dst.rt_next,
1213                                            rt_hash_table[hash].chain);
1214                         /*
1215                          * Since lookup is lockfree, the update writes
1216                          * must be ordered for consistency on SMP.
1217                          */
1218                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1219
1220                         dst_use(&rth->dst, now);
1221                         spin_unlock_bh(rt_hash_lock_addr(hash));
1222
1223                         rt_drop(rt);
1224                         if (skb)
1225                                 skb_dst_set(skb, &rth->dst);
1226                         return rth;
1227                 }
1228
1229                 if (!atomic_read(&rth->dst.__refcnt)) {
1230                         u32 score = rt_score(rth);
1231
1232                         if (score <= min_score) {
1233                                 cand = rth;
1234                                 candp = rthp;
1235                                 min_score = score;
1236                         }
1237                 }
1238
1239                 chain_length++;
1240
1241                 rthp = &rth->dst.rt_next;
1242         }
1243
1244         if (cand) {
1245                 /* ip_rt_gc_elasticity used to be the average chain
1246                  * length; when it is exceeded, gc becomes really aggressive.
1247                  *
1248                  * The second limit is less certain. At the moment it allows
1249                  * only 2 entries per bucket. We will see.
1250                  */
1251                 if (chain_length > ip_rt_gc_elasticity) {
1252                         *candp = cand->dst.rt_next;
1253                         rt_free(cand);
1254                 }
1255         } else {
1256                 if (chain_length > rt_chain_length_max &&
1257                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1258                         struct net *net = dev_net(rt->dst.dev);
1259                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1260                         if (!rt_caching(net)) {
1261                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1262                                         rt->dst.dev->name, num);
1263                         }
1264                         rt_emergency_hash_rebuild(net);
1265                         spin_unlock_bh(rt_hash_lock_addr(hash));
1266
1267                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1268                                         ifindex, rt_genid(net));
1269                         goto restart;
1270                 }
1271         }
1272
1273         /* Try to bind the route to ARP only if it is an output
1274            route or on the unicast forwarding path.
1275          */
1276         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1277                 int err = rt_bind_neighbour(rt);
1278                 if (err) {
1279                         spin_unlock_bh(rt_hash_lock_addr(hash));
1280
1281                         if (err != -ENOBUFS) {
1282                                 rt_drop(rt);
1283                                 return ERR_PTR(err);
1284                         }
1285
1286                         /* Neighbour tables are full and nothing
1287                            can be released. Try to shrink the route cache;
1288                            it most likely holds some neighbour records.
1289                          */
1290                         if (attempts-- > 0) {
1291                                 int saved_elasticity = ip_rt_gc_elasticity;
1292                                 int saved_int = ip_rt_gc_min_interval;
1293                                 ip_rt_gc_elasticity     = 1;
1294                                 ip_rt_gc_min_interval   = 0;
1295                                 rt_garbage_collect(&ipv4_dst_ops);
1296                                 ip_rt_gc_min_interval   = saved_int;
1297                                 ip_rt_gc_elasticity     = saved_elasticity;
1298                                 goto restart;
1299                         }
1300
1301                         if (net_ratelimit())
1302                                 pr_warn("Neighbour table overflow\n");
1303                         rt_drop(rt);
1304                         return ERR_PTR(-ENOBUFS);
1305                 }
1306         }
1307
1308         rt->dst.rt_next = rt_hash_table[hash].chain;
1309
1310         /*
1311          * Since lookup is lockfree, we must make sure
1312          * previous writes to rt are committed to memory
1313          * before making rt visible to other CPUs.
1314          */
1315         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1316
1317         spin_unlock_bh(rt_hash_lock_addr(hash));
1318
1319 skip_hashing:
1320         if (skb)
1321                 skb_dst_set(skb, &rt->dst);
1322         return rt;
1323 }
1324
1325 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1326
1327 static u32 rt_peer_genid(void)
1328 {
1329         return atomic_read(&__rt_peer_genid);
1330 }
1331
1332 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1333 {
1334         struct inet_peer *peer;
1335
1336         peer = inet_getpeer_v4(daddr, create);
1337
1338         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1339                 inet_putpeer(peer);
1340         else
1341                 rt->rt_peer_genid = rt_peer_genid();
1342 }
1343
1344 /*
1345  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1346  * we can still generate some output.
1347  * Random ID selection looks a bit dangerous because we have no chance of
1348  * selecting an ID that is unique within a reasonable period of time.
1349  * But a broken packet identifier may be better than no packet at all.
1350  */
1351 static void ip_select_fb_ident(struct iphdr *iph)
1352 {
1353         static DEFINE_SPINLOCK(ip_fb_id_lock);
1354         static u32 ip_fallback_id;
1355         u32 salt;
1356
1357         spin_lock_bh(&ip_fb_id_lock);
1358         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1359         iph->id = htons(salt & 0xFFFF);
1360         ip_fallback_id = salt;
1361         spin_unlock_bh(&ip_fb_id_lock);
1362 }
1363
1364 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1365 {
1366         struct rtable *rt = (struct rtable *) dst;
1367
1368         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1369                 if (rt->peer == NULL)
1370                         rt_bind_peer(rt, rt->rt_dst, 1);
1371
1372                 /* If peer is attached to destination, it is never detached,
1373                    so we need not grab a lock to dereference it.
1374                  */
1375                 if (rt->peer) {
1376                         iph->id = htons(inet_getid(rt->peer, more));
1377                         return;
1378                 }
1379         } else if (!rt)
1380                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1381                        __builtin_return_address(0));
1382
1383         ip_select_fb_ident(iph);
1384 }
1385 EXPORT_SYMBOL(__ip_select_ident);
1386
1387 static void rt_del(unsigned hash, struct rtable *rt)
1388 {
1389         struct rtable __rcu **rthp;
1390         struct rtable *aux;
1391
1392         rthp = &rt_hash_table[hash].chain;
1393         spin_lock_bh(rt_hash_lock_addr(hash));
1394         ip_rt_put(rt);
1395         while ((aux = rcu_dereference_protected(*rthp,
1396                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1397                 if (aux == rt || rt_is_expired(aux)) {
1398                         *rthp = aux->dst.rt_next;
1399                         rt_free(aux);
1400                         continue;
1401                 }
1402                 rthp = &aux->dst.rt_next;
1403         }
1404         spin_unlock_bh(rt_hash_lock_addr(hash));
1405 }
1406
1407 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1408 {
1409         struct rtable *rt = (struct rtable *) dst;
1410         __be32 orig_gw = rt->rt_gateway;
1411         struct neighbour *n, *old_n;
1412
1413         dst_confirm(&rt->dst);
1414
1415         rt->rt_gateway = peer->redirect_learned.a4;
1416
1417         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1418         if (IS_ERR(n)) {
1419                 rt->rt_gateway = orig_gw;
1420                 return;
1421         }
1422         old_n = xchg(&rt->dst._neighbour, n);
1423         if (old_n)
1424                 neigh_release(old_n);
1425         if (!(n->nud_state & NUD_VALID)) {
1426                 neigh_event_send(n, NULL);
1427         } else {
1428                 rt->rt_flags |= RTCF_REDIRECTED;
1429                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1430         }
1431 }
1432
1433 /* called in rcu_read_lock() section */
1434 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1435                     __be32 saddr, struct net_device *dev)
1436 {
1437         int s, i;
1438         struct in_device *in_dev = __in_dev_get_rcu(dev);
1439         __be32 skeys[2] = { saddr, 0 };
1440         int    ikeys[2] = { dev->ifindex, 0 };
1441         struct inet_peer *peer;
1442         struct net *net;
1443
1444         if (!in_dev)
1445                 return;
1446
1447         net = dev_net(dev);
1448         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1449             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1450             ipv4_is_zeronet(new_gw))
1451                 goto reject_redirect;
1452
1453         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1454                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1455                         goto reject_redirect;
1456                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1457                         goto reject_redirect;
1458         } else {
1459                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1460                         goto reject_redirect;
1461         }
1462
1463         for (s = 0; s < 2; s++) {
1464                 for (i = 0; i < 2; i++) {
1465                         unsigned int hash;
1466                         struct rtable __rcu **rthp;
1467                         struct rtable *rt;
1468
1469                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1470
1471                         rthp = &rt_hash_table[hash].chain;
1472
1473                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1474                                 rthp = &rt->dst.rt_next;
1475
1476                                 if (rt->rt_key_dst != daddr ||
1477                                     rt->rt_key_src != skeys[s] ||
1478                                     rt->rt_oif != ikeys[i] ||
1479                                     rt_is_input_route(rt) ||
1480                                     rt_is_expired(rt) ||
1481                                     !net_eq(dev_net(rt->dst.dev), net) ||
1482                                     rt->dst.error ||
1483                                     rt->dst.dev != dev ||
1484                                     rt->rt_gateway != old_gw)
1485                                         continue;
1486
1487                                 if (!rt->peer)
1488                                         rt_bind_peer(rt, rt->rt_dst, 1);
1489
1490                                 peer = rt->peer;
1491                                 if (peer) {
1492                                         if (peer->redirect_learned.a4 != new_gw) {
1493                                                 peer->redirect_learned.a4 = new_gw;
1494                                                 atomic_inc(&__rt_peer_genid);
1495                                         }
1496                                         check_peer_redir(&rt->dst, peer);
1497                                 }
1498                         }
1499                 }
1500         }
1501         return;
1502
1503 reject_redirect:
1504 #ifdef CONFIG_IP_ROUTE_VERBOSE
1505         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1506                 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
1507                         "  Advised path = %pI4 -> %pI4\n",
1508                         &old_gw, dev->name, &new_gw,
1509                         &saddr, &daddr);
1510 #endif
1511         ;
1512 }
1513
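     /* Atomically claim an expired learned PMTU on @peer so that only one
      * caller restores the original MTU.  peer_pmtu_cleaned() below does the
      * same, but regardless of whether the entry has timed out yet.
      */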
1514 static bool peer_pmtu_expired(struct inet_peer *peer)
1515 {
1516         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1517
1518         return orig &&
1519                time_after_eq(jiffies, orig) &&
1520                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1521 }
1522
1523 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1524 {
1525         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1526
1527         return orig &&
1528                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1529 }
1530
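     /* negative_advice dst_ops hook: release obsolete entries, drop
      * redirected entries from the cache hash and roll an expired learned
      * PMTU back to the original MTU.
      */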
1531 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1532 {
1533         struct rtable *rt = (struct rtable *)dst;
1534         struct dst_entry *ret = dst;
1535
1536         if (rt) {
1537                 if (dst->obsolete > 0) {
1538                         ip_rt_put(rt);
1539                         ret = NULL;
1540                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1541                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1542                                                 rt->rt_oif,
1543                                                 rt_genid(dev_net(dst->dev)));
1544                         rt_del(hash, rt);
1545                         ret = NULL;
1546                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1547                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1548                 }
1549         }
1550         return ret;
1551 }
1552
1553 /*
1554  * Algorithm:
1555  *      1. The first ip_rt_redirect_number redirects are sent
1556  *         with exponential backoff, then we stop sending them at all,
1557  *         assuming that the host ignores our redirects.
1558  *      2. If we did not see packets requiring redirects
1559  *         during ip_rt_redirect_silence, we assume that the host
1560  *         forgot the redirected route and start sending redirects again.
1561  *
1562  * This algorithm is much cheaper and more intelligent than dumb load limiting
1563  * in icmp.c.
1564  *
1565  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1566  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1567  */
1568
1569 void ip_rt_send_redirect(struct sk_buff *skb)
1570 {
1571         struct rtable *rt = skb_rtable(skb);
1572         struct in_device *in_dev;
1573         struct inet_peer *peer;
1574         int log_martians;
1575
1576         rcu_read_lock();
1577         in_dev = __in_dev_get_rcu(rt->dst.dev);
1578         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1579                 rcu_read_unlock();
1580                 return;
1581         }
1582         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1583         rcu_read_unlock();
1584
1585         if (!rt->peer)
1586                 rt_bind_peer(rt, rt->rt_dst, 1);
1587         peer = rt->peer;
1588         if (!peer) {
1589                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1590                 return;
1591         }
1592
1593         /* No redirected packets during ip_rt_redirect_silence;
1594          * reset the algorithm.
1595          */
1596         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1597                 peer->rate_tokens = 0;
1598
1599         /* Too many ignored redirects; do not send anything;
1600          * set rate_last to the last seen redirected packet.
1601          */
1602         if (peer->rate_tokens >= ip_rt_redirect_number) {
1603                 peer->rate_last = jiffies;
1604                 return;
1605         }
1606
1607         /* Check for load limit; set rate_last to the latest sent
1608          * redirect.
1609          */
1610         if (peer->rate_tokens == 0 ||
1611             time_after(jiffies,
1612                        (peer->rate_last +
1613                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1614                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1615                 peer->rate_last = jiffies;
1616                 ++peer->rate_tokens;
1617 #ifdef CONFIG_IP_ROUTE_VERBOSE
1618                 if (log_martians &&
1619                     peer->rate_tokens == ip_rt_redirect_number &&
1620                     net_ratelimit())
1621                         pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1622                                 &ip_hdr(skb)->saddr, rt->rt_iif,
1623                                 &rt->rt_dst, &rt->rt_gateway);
1624 #endif
1625         }
1626 }
1627
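     /* dst.input handler for routes that carry an error.  The error is
      * mapped to an ICMP_DEST_UNREACH code and reported back, rate limited
      * by a token bucket on the destination's inet_peer; the skb is freed.
      */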
1628 static int ip_error(struct sk_buff *skb)
1629 {
1630         struct rtable *rt = skb_rtable(skb);
1631         struct inet_peer *peer;
1632         unsigned long now;
1633         bool send;
1634         int code;
1635
1636         switch (rt->dst.error) {
1637         case EINVAL:
1638         default:
1639                 goto out;
1640         case EHOSTUNREACH:
1641                 code = ICMP_HOST_UNREACH;
1642                 break;
1643         case ENETUNREACH:
1644                 code = ICMP_NET_UNREACH;
1645                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1646                                 IPSTATS_MIB_INNOROUTES);
1647                 break;
1648         case EACCES:
1649                 code = ICMP_PKT_FILTERED;
1650                 break;
1651         }
1652
1653         if (!rt->peer)
1654                 rt_bind_peer(rt, rt->rt_dst, 1);
1655         peer = rt->peer;
1656
1657         send = true;
1658         if (peer) {
1659                 now = jiffies;
1660                 peer->rate_tokens += now - peer->rate_last;
1661                 if (peer->rate_tokens > ip_rt_error_burst)
1662                         peer->rate_tokens = ip_rt_error_burst;
1663                 peer->rate_last = now;
1664                 if (peer->rate_tokens >= ip_rt_error_cost)
1665                         peer->rate_tokens -= ip_rt_error_cost;
1666                 else
1667                         send = false;
1668         }
1669         if (send)
1670                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1671
1672 out:    kfree_skb(skb);
1673         return 0;
1674 }
1675
1676 /*
1677  *      The last two values are not from the RFC but
1678  *      are needed for AMPRnet AX.25 paths.
1679  */
1680
1681 static const unsigned short mtu_plateau[] =
1682 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1683
1684 static inline unsigned short guess_mtu(unsigned short old_mtu)
1685 {
1686         int i;
1687
1688         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1689                 if (old_mtu > mtu_plateau[i])
1690                         return mtu_plateau[i];
1691         return 68;
1692 }
1693
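     /* Handle an ICMP "fragmentation needed" report for @iph.  A missing or
      * bogus MTU is estimated from the plateau table, clamped to
      * ip_rt_min_pmtu, and recorded with an expiry on the destination's
      * inet_peer.  Returns the recorded MTU, or @new_mtu if nothing changed.
      */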
1694 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1695                                  unsigned short new_mtu,
1696                                  struct net_device *dev)
1697 {
1698         unsigned short old_mtu = ntohs(iph->tot_len);
1699         unsigned short est_mtu = 0;
1700         struct inet_peer *peer;
1701
1702         peer = inet_getpeer_v4(iph->daddr, 1);
1703         if (peer) {
1704                 unsigned short mtu = new_mtu;
1705
1706                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1707                         /* BSD 4.2 derived systems incorrectly adjust
1708                          * tot_len by the IP header length, and report
1709                          * a zero MTU in the ICMP message.
1710                          */
1711                         if (mtu == 0 &&
1712                             old_mtu >= 68 + (iph->ihl << 2))
1713                                 old_mtu -= iph->ihl << 2;
1714                         mtu = guess_mtu(old_mtu);
1715                 }
1716
1717                 if (mtu < ip_rt_min_pmtu)
1718                         mtu = ip_rt_min_pmtu;
1719                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1720                         unsigned long pmtu_expires;
1721
1722                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1723                         if (!pmtu_expires)
1724                                 pmtu_expires = 1UL;
1725
1726                         est_mtu = mtu;
1727                         peer->pmtu_learned = mtu;
1728                         peer->pmtu_expires = pmtu_expires;
1729                         atomic_inc(&__rt_peer_genid);
1730                 }
1731
1732                 inet_putpeer(peer);
1733         }
1734         return est_mtu ? : new_mtu;
1735 }
1736
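     /* Apply @peer's learned PMTU to @dst while it is still valid, saving
      * the original metric first; once the entry has expired it is cleared
      * atomically and the original MTU is restored.
      */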
1737 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1738 {
1739         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1740
1741         if (!expires)
1742                 return;
1743         if (time_before(jiffies, expires)) {
1744                 u32 orig_dst_mtu = dst_mtu(dst);
1745                 if (peer->pmtu_learned < orig_dst_mtu) {
1746                         if (!peer->pmtu_orig)
1747                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1748                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1749                 }
1750         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1751                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1752 }
1753
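     /* update_pmtu dst_ops hook.  The new MTU is clamped to ip_rt_min_pmtu,
      * recorded with an expiry on the route's inet_peer when it lowers the
      * current estimate, and then applied via check_peer_pmtu().
      */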
1754 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1755 {
1756         struct rtable *rt = (struct rtable *) dst;
1757         struct inet_peer *peer;
1758
1759         dst_confirm(dst);
1760
1761         if (!rt->peer)
1762                 rt_bind_peer(rt, rt->rt_dst, 1);
1763         peer = rt->peer;
1764         if (peer) {
1765                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1766
1767                 if (mtu < ip_rt_min_pmtu)
1768                         mtu = ip_rt_min_pmtu;
1769                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1770
1771                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1772                         if (!pmtu_expires)
1773                                 pmtu_expires = 1UL;
1774
1775                         peer->pmtu_learned = mtu;
1776                         peer->pmtu_expires = pmtu_expires;
1777
1778                         atomic_inc(&__rt_peer_genid);
1779                         rt->rt_peer_genid = rt_peer_genid();
1780                 }
1781                 check_peer_pmtu(dst, peer);
1782         }
1783 }
1784
1785
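     /* Resynchronise a cached route with its inet_peer when the peer
      * generation has moved on: re-check the learned PMTU and any learned
      * redirect before marking the route up to date again.
      */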
1786 static void ipv4_validate_peer(struct rtable *rt)
1787 {
1788         if (rt->rt_peer_genid != rt_peer_genid()) {
1789                 struct inet_peer *peer;
1790
1791                 if (!rt->peer)
1792                         rt_bind_peer(rt, rt->rt_dst, 0);
1793
1794                 peer = rt->peer;
1795                 if (peer) {
1796                         check_peer_pmtu(&rt->dst, peer);
1797
1798                         if (peer->redirect_learned.a4 &&
1799                             peer->redirect_learned.a4 != rt->rt_gateway)
1800                                 check_peer_redir(&rt->dst, peer);
1801                 }
1802
1803                 rt->rt_peer_genid = rt_peer_genid();
1804         }
1805 }
1806
1807 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808 {
1809         struct rtable *rt = (struct rtable *) dst;
1810
1811         if (rt_is_expired(rt))
1812                 return NULL;
1813         ipv4_validate_peer(rt);
1814         return dst;
1815 }
1816
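     /* destroy dst_ops hook: drop the references the route holds on its
      * fib_info and inet_peer, if any.
      */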
1817 static void ipv4_dst_destroy(struct dst_entry *dst)
1818 {
1819         struct rtable *rt = (struct rtable *) dst;
1820         struct inet_peer *peer = rt->peer;
1821
1822         if (rt->fi) {
1823                 fib_info_put(rt->fi);
1824                 rt->fi = NULL;
1825         }
1826         if (peer) {
1827                 rt->peer = NULL;
1828                 inet_putpeer(peer);
1829         }
1830 }
1831
1832
1833 static void ipv4_link_failure(struct sk_buff *skb)
1834 {
1835         struct rtable *rt;
1836
1837         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838
1839         rt = skb_rtable(skb);
1840         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1842 }
1843
1844 static int ip_rt_bug(struct sk_buff *skb)
1845 {
1846         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1848                 skb->dev ? skb->dev->name : "?");
1849         kfree_skb(skb);
1850         WARN_ON(1);
1851         return 0;
1852 }
1853
1854 /*
1855    We do not cache the source address of the outgoing interface,
1856    because it is used only by IP RR, TS and SRR options,
1857    so it is out of the fast path.
1858
1859    BTW remember: "addr" is allowed to be unaligned
1860    in IP options!
1861  */
1862
1863 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1864 {
1865         __be32 src;
1866
1867         if (rt_is_output_route(rt))
1868                 src = ip_hdr(skb)->saddr;
1869         else {
1870                 struct fib_result res;
1871                 struct flowi4 fl4;
1872                 struct iphdr *iph;
1873
1874                 iph = ip_hdr(skb);
1875
1876                 memset(&fl4, 0, sizeof(fl4));
1877                 fl4.daddr = iph->daddr;
1878                 fl4.saddr = iph->saddr;
1879                 fl4.flowi4_tos = RT_TOS(iph->tos);
1880                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1881                 fl4.flowi4_iif = skb->dev->ifindex;
1882                 fl4.flowi4_mark = skb->mark;
1883
1884                 rcu_read_lock();
1885                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1886                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1887                 else
1888                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1889                                         RT_SCOPE_UNIVERSE);
1890                 rcu_read_unlock();
1891         }
1892         memcpy(addr, &src, 4);
1893 }
1894
1895 #ifdef CONFIG_IP_ROUTE_CLASSID
1896 static void set_class_tag(struct rtable *rt, u32 tag)
1897 {
1898         if (!(rt->dst.tclassid & 0xFFFF))
1899                 rt->dst.tclassid |= tag & 0xFFFF;
1900         if (!(rt->dst.tclassid & 0xFFFF0000))
1901                 rt->dst.tclassid |= tag & 0xFFFF0000;
1902 }
1903 #endif
1904
1905 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906 {
1907         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908
1909         if (advmss == 0) {
1910                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911                                ip_rt_min_advmss);
1912                 if (advmss > 65535 - 40)
1913                         advmss = 65535 - 40;
1914         }
1915         return advmss;
1916 }
1917
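     /* mtu dst_ops hook.  Output routes use the cached RTAX_MTU metric when
      * it is set; otherwise the device MTU is used, lowered to 576 for
      * locked metrics on gatewayed routes and capped at IP_MAX_MTU.
      */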
1918 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1919 {
1920         const struct rtable *rt = (const struct rtable *) dst;
1921         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922
1923         if (mtu && rt_is_output_route(rt))
1924                 return mtu;
1925
1926         mtu = dst->dev->mtu;
1927
1928         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1929
1930                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931                         mtu = 576;
1932         }
1933
1934         if (mtu > IP_MAX_MTU)
1935                 mtu = IP_MAX_MTU;
1936
1937         return mtu;
1938 }
1939
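     /* Attach metrics to a freshly built route: preferably the inet_peer's
      * cached metrics (creating the peer if the flow asked for it), falling
      * back to the fib_info's metrics, on which a reference is taken when
      * they are not the defaults.
      */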
1940 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1941                             struct fib_info *fi)
1942 {
1943         struct inet_peer *peer;
1944         int create = 0;
1945
1946         /* If a peer entry exists for this destination, we must hook
1947          * it up in order to get at cached metrics.
1948          */
1949         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1950                 create = 1;
1951
1952         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1953         if (peer) {
1954                 rt->rt_peer_genid = rt_peer_genid();
1955                 if (inet_metrics_new(peer))
1956                         memcpy(peer->metrics, fi->fib_metrics,
1957                                sizeof(u32) * RTAX_MAX);
1958                 dst_init_metrics(&rt->dst, peer->metrics, false);
1959
1960                 check_peer_pmtu(&rt->dst, peer);
1961
1962                 if (peer->redirect_learned.a4 &&
1963                     peer->redirect_learned.a4 != rt->rt_gateway) {
1964                         rt->rt_gateway = peer->redirect_learned.a4;
1965                         rt->rt_flags |= RTCF_REDIRECTED;
1966                 }
1967         } else {
1968                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1969                         rt->fi = fi;
1970                         atomic_inc(&fi->fib_clntref);
1971                 }
1972                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1973         }
1974 }
1975
1976 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1977                            const struct fib_result *res,
1978                            struct fib_info *fi, u16 type, u32 itag)
1979 {
1980         struct dst_entry *dst = &rt->dst;
1981
1982         if (fi) {
1983                 if (FIB_RES_GW(*res) &&
1984                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1985                         rt->rt_gateway = FIB_RES_GW(*res);
1986                 rt_init_metrics(rt, fl4, fi);
1987 #ifdef CONFIG_IP_ROUTE_CLASSID
1988                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1989 #endif
1990         }
1991
1992         if (dst_mtu(dst) > IP_MAX_MTU)
1993                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1994         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1995                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1996
1997 #ifdef CONFIG_IP_ROUTE_CLASSID
1998 #ifdef CONFIG_IP_MULTIPLE_TABLES
1999         set_class_tag(rt, fib_rules_tclass(res));
2000 #endif
2001         set_class_tag(rt, itag);
2002 #endif
2003 }
2004
2005 static struct rtable *rt_dst_alloc(struct net_device *dev,
2006                                    bool nopolicy, bool noxfrm)
2007 {
2008         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2009                          DST_HOST |
2010                          (nopolicy ? DST_NOPOLICY : 0) |
2011                          (noxfrm ? DST_NOXFRM : 0));
2012 }
2013
2014 /* called in rcu_read_lock() section */
2015 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2016                                 u8 tos, struct net_device *dev, int our)
2017 {
2018         unsigned int hash;
2019         struct rtable *rth;
2020         __be32 spec_dst;
2021         struct in_device *in_dev = __in_dev_get_rcu(dev);
2022         u32 itag = 0;
2023         int err;
2024
2025         /* Primary sanity checks. */
2026
2027         if (in_dev == NULL)
2028                 return -EINVAL;
2029
2030         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2031             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2032                 goto e_inval;
2033
2034         if (ipv4_is_zeronet(saddr)) {
2035                 if (!ipv4_is_local_multicast(daddr))
2036                         goto e_inval;
2037                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2038         } else {
2039                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2040                                           &itag);
2041                 if (err < 0)
2042                         goto e_err;
2043         }
2044         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2045                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2046         if (!rth)
2047                 goto e_nobufs;
2048
2049 #ifdef CONFIG_IP_ROUTE_CLASSID
2050         rth->dst.tclassid = itag;
2051 #endif
2052         rth->dst.output = ip_rt_bug;
2053
2054         rth->rt_key_dst = daddr;
2055         rth->rt_key_src = saddr;
2056         rth->rt_genid   = rt_genid(dev_net(dev));
2057         rth->rt_flags   = RTCF_MULTICAST;
2058         rth->rt_type    = RTN_MULTICAST;
2059         rth->rt_key_tos = tos;
2060         rth->rt_dst     = daddr;
2061         rth->rt_src     = saddr;
2062         rth->rt_route_iif = dev->ifindex;
2063         rth->rt_iif     = dev->ifindex;
2064         rth->rt_oif     = 0;
2065         rth->rt_mark    = skb->mark;
2066         rth->rt_gateway = daddr;
2067         rth->rt_spec_dst= spec_dst;
2068         rth->rt_peer_genid = 0;
2069         rth->peer = NULL;
2070         rth->fi = NULL;
2071         if (our) {
2072                 rth->dst.input= ip_local_deliver;
2073                 rth->rt_flags |= RTCF_LOCAL;
2074         }
2075
2076 #ifdef CONFIG_IP_MROUTE
2077         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2078                 rth->dst.input = ip_mr_input;
2079 #endif
2080         RT_CACHE_STAT_INC(in_slow_mc);
2081
2082         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2083         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2084         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2085
2086 e_nobufs:
2087         return -ENOBUFS;
2088 e_inval:
2089         return -EINVAL;
2090 e_err:
2091         return err;
2092 }
2093
2094
2095 static void ip_handle_martian_source(struct net_device *dev,
2096                                      struct in_device *in_dev,
2097                                      struct sk_buff *skb,
2098                                      __be32 daddr,
2099                                      __be32 saddr)
2100 {
2101         RT_CACHE_STAT_INC(in_martian_src);
2102 #ifdef CONFIG_IP_ROUTE_VERBOSE
2103         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2104                 /*
2105                  *      RFC1812 recommendation: if the source is martian,
2106                  *      the only hint is the MAC header.
2107                  */
2108                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2109                         &daddr, &saddr, dev->name);
2110                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2111                         print_hex_dump(KERN_WARNING, "ll header: ",
2112                                        DUMP_PREFIX_OFFSET, 16, 1,
2113                                        skb_mac_header(skb),
2114                                        dev->hard_header_len, true);
2115                 }
2116         }
2117 #endif
2118 }
2119
2120 /* called in rcu_read_lock() section */
2121 static int __mkroute_input(struct sk_buff *skb,
2122                            const struct fib_result *res,
2123                            struct in_device *in_dev,
2124                            __be32 daddr, __be32 saddr, u32 tos,
2125                            struct rtable **result)
2126 {
2127         struct rtable *rth;
2128         int err;
2129         struct in_device *out_dev;
2130         unsigned int flags = 0;
2131         __be32 spec_dst;
2132         u32 itag;
2133
2134         /* get a working reference to the output device */
2135         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2136         if (out_dev == NULL) {
2137                 if (net_ratelimit())
2138                         pr_crit("Bug in ip_route_input_slow(). Please report.\n");
2139                 return -EINVAL;
2140         }
2141
2142
2143         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2144                                   in_dev->dev, &spec_dst, &itag);
2145         if (err < 0) {
2146                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2147                                          saddr);
2148
2149                 goto cleanup;
2150         }
2151
2152         if (err)
2153                 flags |= RTCF_DIRECTSRC;
2154
2155         if (out_dev == in_dev && err &&
2156             (IN_DEV_SHARED_MEDIA(out_dev) ||
2157              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2158                 flags |= RTCF_DOREDIRECT;
2159
2160         if (skb->protocol != htons(ETH_P_IP)) {
2161                 /* Not IP (i.e. ARP). Do not create a route if it is
2162                  * invalid for proxy arp. DNAT routes are always valid.
2163                  *
2164                  * The proxy arp feature has been extended to allow ARP
2165                  * replies back on the same interface, to support
2166                  * Private VLAN switch technologies. See arp.c.
2167                  */
2168                 if (out_dev == in_dev &&
2169                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2170                         err = -EINVAL;
2171                         goto cleanup;
2172                 }
2173         }
2174
2175         rth = rt_dst_alloc(out_dev->dev,
2176                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2177                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2178         if (!rth) {
2179                 err = -ENOBUFS;
2180                 goto cleanup;
2181         }
2182
2183         rth->rt_key_dst = daddr;
2184         rth->rt_key_src = saddr;
2185         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2186         rth->rt_flags = flags;
2187         rth->rt_type = res->type;
2188         rth->rt_key_tos = tos;
2189         rth->rt_dst     = daddr;
2190         rth->rt_src     = saddr;
2191         rth->rt_route_iif = in_dev->dev->ifindex;
2192         rth->rt_iif     = in_dev->dev->ifindex;
2193         rth->rt_oif     = 0;
2194         rth->rt_mark    = skb->mark;
2195         rth->rt_gateway = daddr;
2196         rth->rt_spec_dst= spec_dst;
2197         rth->rt_peer_genid = 0;
2198         rth->peer = NULL;
2199         rth->fi = NULL;
2200
2201         rth->dst.input = ip_forward;
2202         rth->dst.output = ip_output;
2203
2204         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2205
2206         *result = rth;
2207         err = 0;
2208  cleanup:
2209         return err;
2210 }
2211
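     /* Resolve the nexthop (honouring multipath when configured), build the
      * input route with __mkroute_input() and insert it into the cache hash.
      */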
2212 static int ip_mkroute_input(struct sk_buff *skb,
2213                             struct fib_result *res,
2214                             const struct flowi4 *fl4,
2215                             struct in_device *in_dev,
2216                             __be32 daddr, __be32 saddr, u32 tos)
2217 {
2218         struct rtable *rth = NULL;
2219         int err;
2220         unsigned hash;
2221
2222 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2223         if (res->fi && res->fi->fib_nhs > 1)
2224                 fib_select_multipath(res);
2225 #endif
2226
2227         /* create a routing cache entry */
2228         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2229         if (err)
2230                 return err;
2231
2232         /* put it into the cache */
2233         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2234                        rt_genid(dev_net(rth->dst.dev)));
2235         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2236         if (IS_ERR(rth))
2237                 return PTR_ERR(rth);
2238         return 0;
2239 }
2240
2241 /*
2242  *      NOTE. We drop all packets that have a local source
2243  *      address, because every properly looped back packet
2244  *      must already have the correct destination attached by the output routine.
2245  *
2246  *      This approach solves two big problems:
2247  *      1. Non-simplex devices are handled properly.
2248  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2249  *      Called with rcu_read_lock().
2250  */
2251
2252 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2253                                u8 tos, struct net_device *dev)
2254 {
2255         struct fib_result res;
2256         struct in_device *in_dev = __in_dev_get_rcu(dev);
2257         struct flowi4   fl4;
2258         unsigned        flags = 0;
2259         u32             itag = 0;
2260         struct rtable * rth;
2261         unsigned        hash;
2262         __be32          spec_dst;
2263         int             err = -EINVAL;
2264         struct net    * net = dev_net(dev);
2265
2266         /* IP on this device is disabled. */
2267
2268         if (!in_dev)
2269                 goto out;
2270
2271         /* Check for the most weird martians, which cannot be detected
2272            by fib_lookup.
2273          */
2274
2275         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2276             ipv4_is_loopback(saddr))
2277                 goto martian_source;
2278
2279         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2280                 goto brd_input;
2281
2282         /* Accept zero addresses only for limited broadcast;
2283          * I do not even know whether to fix this or not. Waiting for complaints :-)
2284          */
2285         if (ipv4_is_zeronet(saddr))
2286                 goto martian_source;
2287
2288         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2289                 goto martian_destination;
2290
2291         /*
2292          *      Now we are ready to route the packet.
2293          */
2294         fl4.flowi4_oif = 0;
2295         fl4.flowi4_iif = dev->ifindex;
2296         fl4.flowi4_mark = skb->mark;
2297         fl4.flowi4_tos = tos;
2298         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2299         fl4.daddr = daddr;
2300         fl4.saddr = saddr;
2301         err = fib_lookup(net, &fl4, &res);
2302         if (err != 0) {
2303                 if (!IN_DEV_FORWARD(in_dev))
2304                         goto e_hostunreach;
2305                 goto no_route;
2306         }
2307
2308         RT_CACHE_STAT_INC(in_slow_tot);
2309
2310         if (res.type == RTN_BROADCAST)
2311                 goto brd_input;
2312
2313         if (res.type == RTN_LOCAL) {
2314                 err = fib_validate_source(skb, saddr, daddr, tos,
2315                                           net->loopback_dev->ifindex,
2316                                           dev, &spec_dst, &itag);
2317                 if (err < 0)
2318                         goto martian_source_keep_err;
2319                 if (err)
2320                         flags |= RTCF_DIRECTSRC;
2321                 spec_dst = daddr;
2322                 goto local_input;
2323         }
2324
2325         if (!IN_DEV_FORWARD(in_dev))
2326                 goto e_hostunreach;
2327         if (res.type != RTN_UNICAST)
2328                 goto martian_destination;
2329
2330         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2331 out:    return err;
2332
2333 brd_input:
2334         if (skb->protocol != htons(ETH_P_IP))
2335                 goto e_inval;
2336
2337         if (ipv4_is_zeronet(saddr))
2338                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2339         else {
2340                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2341                                           &itag);
2342                 if (err < 0)
2343                         goto martian_source_keep_err;
2344                 if (err)
2345                         flags |= RTCF_DIRECTSRC;
2346         }
2347         flags |= RTCF_BROADCAST;
2348         res.type = RTN_BROADCAST;
2349         RT_CACHE_STAT_INC(in_brd);
2350
2351 local_input:
2352         rth = rt_dst_alloc(net->loopback_dev,
2353                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2354         if (!rth)
2355                 goto e_nobufs;
2356
2357         rth->dst.input= ip_local_deliver;
2358         rth->dst.output= ip_rt_bug;
2359 #ifdef CONFIG_IP_ROUTE_CLASSID
2360         rth->dst.tclassid = itag;
2361 #endif
2362
2363         rth->rt_key_dst = daddr;
2364         rth->rt_key_src = saddr;
2365         rth->rt_genid = rt_genid(net);
2366         rth->rt_flags   = flags|RTCF_LOCAL;
2367         rth->rt_type    = res.type;
2368         rth->rt_key_tos = tos;
2369         rth->rt_dst     = daddr;
2370         rth->rt_src     = saddr;
2374         rth->rt_route_iif = dev->ifindex;
2375         rth->rt_iif     = dev->ifindex;
2376         rth->rt_oif     = 0;
2377         rth->rt_mark    = skb->mark;
2378         rth->rt_gateway = daddr;
2379         rth->rt_spec_dst= spec_dst;
2380         rth->rt_peer_genid = 0;
2381         rth->peer = NULL;
2382         rth->fi = NULL;
2383         if (res.type == RTN_UNREACHABLE) {
2384                 rth->dst.input= ip_error;
2385                 rth->dst.error= -err;
2386                 rth->rt_flags   &= ~RTCF_LOCAL;
2387         }
2388         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2389         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2390         err = 0;
2391         if (IS_ERR(rth))
2392                 err = PTR_ERR(rth);
2393         goto out;
2394
2395 no_route:
2396         RT_CACHE_STAT_INC(in_no_route);
2397         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2398         res.type = RTN_UNREACHABLE;
2399         if (err == -ESRCH)
2400                 err = -ENETUNREACH;
2401         goto local_input;
2402
2403         /*
2404          *      Do not cache martian addresses: they should be logged (RFC1812)
2405          */
2406 martian_destination:
2407         RT_CACHE_STAT_INC(in_martian_dst);
2408 #ifdef CONFIG_IP_ROUTE_VERBOSE
2409         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2410                 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
2411                         &daddr, &saddr, dev->name);
2412 #endif
2413
2414 e_hostunreach:
2415         err = -EHOSTUNREACH;
2416         goto out;
2417
2418 e_inval:
2419         err = -EINVAL;
2420         goto out;
2421
2422 e_nobufs:
2423         err = -ENOBUFS;
2424         goto out;
2425
2426 martian_source:
2427         err = -EINVAL;
2428 martian_source_keep_err:
2429         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2430         goto out;
2431 }
2432
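     /* Main input route lookup.  The flow is first searched in the route
      * cache hash under RCU; on a miss, multicast destinations are handled
      * here and everything else falls through to ip_route_input_slow().
      */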
2433 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2434                            u8 tos, struct net_device *dev, bool noref)
2435 {
2436         struct rtable * rth;
2437         unsigned        hash;
2438         int iif = dev->ifindex;
2439         struct net *net;
2440         int res;
2441
2442         net = dev_net(dev);
2443
2444         rcu_read_lock();
2445
2446         if (!rt_caching(net))
2447                 goto skip_cache;
2448
2449         tos &= IPTOS_RT_MASK;
2450         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2451
2452         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2453              rth = rcu_dereference(rth->dst.rt_next)) {
2454                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2455                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2456                      (rth->rt_route_iif ^ iif) |
2457                      (rth->rt_key_tos ^ tos)) == 0 &&
2458                     rth->rt_mark == skb->mark &&
2459                     net_eq(dev_net(rth->dst.dev), net) &&
2460                     !rt_is_expired(rth)) {
2461                         ipv4_validate_peer(rth);
2462                         if (noref) {
2463                                 dst_use_noref(&rth->dst, jiffies);
2464                                 skb_dst_set_noref(skb, &rth->dst);
2465                         } else {
2466                                 dst_use(&rth->dst, jiffies);
2467                                 skb_dst_set(skb, &rth->dst);
2468                         }
2469                         RT_CACHE_STAT_INC(in_hit);
2470                         rcu_read_unlock();
2471                         return 0;
2472                 }
2473                 RT_CACHE_STAT_INC(in_hlist_search);
2474         }
2475
2476 skip_cache:
2477         /* Multicast recognition logic was moved from the route cache to here.
2478            The problem was that too many Ethernet cards have broken/missing
2479            hardware multicast filters :-( As a result, a host on a multicast
2480            network acquires a lot of useless route cache entries, e.g. for
2481            SDR messages from all over the world. Now we try to get rid of them.
2482            Really, provided the software IP multicast filter is organized
2483            reasonably (at least, hashed), it does not result in a slowdown
2484            compared with route cache reject entries.
2485            Note that multicast routers are not affected, because a
2486            route cache entry is created for them eventually.
2487          */
2488         if (ipv4_is_multicast(daddr)) {
2489                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2490
2491                 if (in_dev) {
2492                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2493                                                   ip_hdr(skb)->protocol);
2494                         if (our
2495 #ifdef CONFIG_IP_MROUTE
2496                                 ||
2497                             (!ipv4_is_local_multicast(daddr) &&
2498                              IN_DEV_MFORWARD(in_dev))
2499 #endif
2500                            ) {
2501                                 int res = ip_route_input_mc(skb, daddr, saddr,
2502                                                             tos, dev, our);
2503                                 rcu_read_unlock();
2504                                 return res;
2505                         }
2506                 }
2507                 rcu_read_unlock();
2508                 return -EINVAL;
2509         }
2510         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2511         rcu_read_unlock();
2512         return res;
2513 }
2514 EXPORT_SYMBOL(ip_route_input_common);
2515
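     /* Build an output route cache entry for @res/@fl4 on @dev_out:
      * classify the destination (local, broadcast, multicast), allocate the
      * rtable, fill in keys and handlers and hook up the nexthop.
      */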
2516 /* called with rcu_read_lock() */
2517 static struct rtable *__mkroute_output(const struct fib_result *res,
2518                                        const struct flowi4 *fl4,
2519                                        __be32 orig_daddr, __be32 orig_saddr,
2520                                        int orig_oif, __u8 orig_rtos,
2521                                        struct net_device *dev_out,
2522                                        unsigned int flags)
2523 {
2524         struct fib_info *fi = res->fi;
2525         struct in_device *in_dev;
2526         u16 type = res->type;
2527         struct rtable *rth;
2528
2529         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2530                 return ERR_PTR(-EINVAL);
2531
2532         if (ipv4_is_lbcast(fl4->daddr))
2533                 type = RTN_BROADCAST;
2534         else if (ipv4_is_multicast(fl4->daddr))
2535                 type = RTN_MULTICAST;
2536         else if (ipv4_is_zeronet(fl4->daddr))
2537                 return ERR_PTR(-EINVAL);
2538
2539         if (dev_out->flags & IFF_LOOPBACK)
2540                 flags |= RTCF_LOCAL;
2541
2542         in_dev = __in_dev_get_rcu(dev_out);
2543         if (!in_dev)
2544                 return ERR_PTR(-EINVAL);
2545
2546         if (type == RTN_BROADCAST) {
2547                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2548                 fi = NULL;
2549         } else if (type == RTN_MULTICAST) {
2550                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2551                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2552                                      fl4->flowi4_proto))
2553                         flags &= ~RTCF_LOCAL;
2554                 /* If a multicast route does not exist, use the
2555                  * default one, but do not gateway in this case.
2556                  * Yes, it is a hack.
2557                  */
2558                 if (fi && res->prefixlen < 4)
2559                         fi = NULL;
2560         }
2561
2562         rth = rt_dst_alloc(dev_out,
2563                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2564                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2565         if (!rth)
2566                 return ERR_PTR(-ENOBUFS);
2567
2568         rth->dst.output = ip_output;
2569
2570         rth->rt_key_dst = orig_daddr;
2571         rth->rt_key_src = orig_saddr;
2572         rth->rt_genid = rt_genid(dev_net(dev_out));
2573         rth->rt_flags   = flags;
2574         rth->rt_type    = type;
2575         rth->rt_key_tos = orig_rtos;
2576         rth->rt_dst     = fl4->daddr;
2577         rth->rt_src     = fl4->saddr;
2578         rth->rt_route_iif = 0;
2579         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2580         rth->rt_oif     = orig_oif;
2581         rth->rt_mark    = fl4->flowi4_mark;
2582         rth->rt_gateway = fl4->daddr;
2583         rth->rt_spec_dst= fl4->saddr;
2584         rth->rt_peer_genid = 0;
2585         rth->peer = NULL;
2586         rth->fi = NULL;
2587
2588         RT_CACHE_STAT_INC(out_slow_tot);
2589
2590         if (flags & RTCF_LOCAL) {
2591                 rth->dst.input = ip_local_deliver;
2592                 rth->rt_spec_dst = fl4->daddr;
2593         }
2594         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2595                 rth->rt_spec_dst = fl4->saddr;
2596                 if (flags & RTCF_LOCAL &&
2597                     !(dev_out->flags & IFF_LOOPBACK)) {
2598                         rth->dst.output = ip_mc_output;
2599                         RT_CACHE_STAT_INC(out_slow_mc);
2600                 }
2601 #ifdef CONFIG_IP_MROUTE
2602                 if (type == RTN_MULTICAST) {
2603                         if (IN_DEV_MFORWARD(in_dev) &&
2604                             !ipv4_is_local_multicast(fl4->daddr)) {
2605                                 rth->dst.input = ip_mr_input;
2606                                 rth->dst.output = ip_mc_output;
2607                         }
2608                 }
2609 #endif
2610         }
2611
2612         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2613
2614         return rth;
2615 }
2616
2617 /*
2618  * Major route resolver routine.
2619  * Called with rcu_read_lock().
2620  */
2621
2622 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2623 {
2624         struct net_device *dev_out = NULL;
2625         __u8 tos = RT_FL_TOS(fl4);
2626         unsigned int flags = 0;
2627         struct fib_result res;
2628         struct rtable *rth;
2629         __be32 orig_daddr;
2630         __be32 orig_saddr;
2631         int orig_oif;
2632
2633         res.fi          = NULL;
2634 #ifdef CONFIG_IP_MULTIPLE_TABLES
2635         res.r           = NULL;
2636 #endif
2637
2638         orig_daddr = fl4->daddr;
2639         orig_saddr = fl4->saddr;
2640         orig_oif = fl4->flowi4_oif;
2641
2642         fl4->flowi4_iif = net->loopback_dev->ifindex;
2643         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2644         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2645                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2646
2647         rcu_read_lock();
2648         if (fl4->saddr) {
2649                 rth = ERR_PTR(-EINVAL);
2650                 if (ipv4_is_multicast(fl4->saddr) ||
2651                     ipv4_is_lbcast(fl4->saddr) ||
2652                     ipv4_is_zeronet(fl4->saddr))
2653                         goto out;
2654
2655                 /* I removed the check for oif == dev_out->oif here.
2656                    It was wrong for two reasons:
2657                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2658                       is assigned to multiple interfaces.
2659                    2. Moreover, we are allowed to send packets with the saddr
2660                       of another iface. --ANK
2661                  */
2662
2663                 if (fl4->flowi4_oif == 0 &&
2664                     (ipv4_is_multicast(fl4->daddr) ||
2665                      ipv4_is_lbcast(fl4->daddr))) {
2666                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2667                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2668                         if (dev_out == NULL)
2669                                 goto out;
2670
2671                         /* Special hack: the user can direct multicasts
2672                            and limited broadcast via the necessary interface
2673                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2674                            This hack is not just for fun, it allows
2675                            vic, vat and friends to work.
2676                            They bind a socket to loopback, set the ttl to zero
2677                            and expect that it will work.
2678                            From the viewpoint of the routing cache they are broken,
2679                            because we are not allowed to build a multicast path
2680                            with a loopback source addr (look, the routing cache
2681                            cannot know that the ttl is zero, so the packet
2682                            will not leave this host and the route is valid).
2683                            Luckily, this hack is a good workaround.
2684                          */
2685
2686                         fl4->flowi4_oif = dev_out->ifindex;
2687                         goto make_route;
2688                 }
2689
2690                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2691                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2692                         if (!__ip_dev_find(net, fl4->saddr, false))
2693                                 goto out;
2694                 }
2695         }
2696
2697
2698         if (fl4->flowi4_oif) {
2699                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2700                 rth = ERR_PTR(-ENODEV);
2701                 if (dev_out == NULL)
2702                         goto out;
2703
2704                 /* RACE: Check return value of inet_select_addr instead. */
2705                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2706                         rth = ERR_PTR(-ENETUNREACH);
2707                         goto out;
2708                 }
2709                 if (ipv4_is_local_multicast(fl4->daddr) ||
2710                     ipv4_is_lbcast(fl4->daddr)) {
2711                         if (!fl4->saddr)
2712                                 fl4->saddr = inet_select_addr(dev_out, 0,
2713                                                               RT_SCOPE_LINK);
2714                         goto make_route;
2715                 }
2716                 if (fl4->saddr) {
2717                         if (ipv4_is_multicast(fl4->daddr))
2718                                 fl4->saddr = inet_select_addr(dev_out, 0,
2719                                                               fl4->flowi4_scope);
2720                         else if (!fl4->daddr)
2721                                 fl4->saddr = inet_select_addr(dev_out, 0,
2722                                                               RT_SCOPE_HOST);
2723                 }
2724         }
2725
2726         if (!fl4->daddr) {
2727                 fl4->daddr = fl4->saddr;
2728                 if (!fl4->daddr)
2729                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2730                 dev_out = net->loopback_dev;
2731                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2732                 res.type = RTN_LOCAL;
2733                 flags |= RTCF_LOCAL;
2734                 goto make_route;
2735         }
2736
2737         if (fib_lookup(net, fl4, &res)) {
2738                 res.fi = NULL;
2739                 if (fl4->flowi4_oif) {
2740                         /* Apparently, the routing tables are wrong. Assume
2741                            that the destination is on link.
2742
2743                            WHY? DW.
2744                            Because we are allowed to send to an iface
2745                            even if it has NO routes and NO assigned
2746                            addresses. When oif is specified, the routing
2747                            tables are looked up with only one purpose:
2748                            to catch whether the destination is gatewayed, rather than
2749                            direct. Moreover, if MSG_DONTROUTE is set,
2750                            we send the packet, ignoring both routing tables
2751                            and ifaddr state. --ANK
2752
2753
2754                            We could do this even if oif is unknown,
2755                            likely as IPv6 does, but we do not.
2756                          */
2757
2758                         if (fl4->saddr == 0)
2759                                 fl4->saddr = inet_select_addr(dev_out, 0,
2760                                                               RT_SCOPE_LINK);
2761                         res.type = RTN_UNICAST;
2762                         goto make_route;
2763                 }
2764                 rth = ERR_PTR(-ENETUNREACH);
2765                 goto out;
2766         }
2767
2768         if (res.type == RTN_LOCAL) {
2769                 if (!fl4->saddr) {
2770                         if (res.fi->fib_prefsrc)
2771                                 fl4->saddr = res.fi->fib_prefsrc;
2772                         else
2773                                 fl4->saddr = fl4->daddr;
2774                 }
2775                 dev_out = net->loopback_dev;
2776                 fl4->flowi4_oif = dev_out->ifindex;
2777                 res.fi = NULL;
2778                 flags |= RTCF_LOCAL;
2779                 goto make_route;
2780         }
2781
2782 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2783         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2784                 fib_select_multipath(&res);
2785         else
2786 #endif
2787         if (!res.prefixlen &&
2788             res.table->tb_num_default > 1 &&
2789             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2790                 fib_select_default(&res);
2791
2792         if (!fl4->saddr)
2793                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2794
2795         dev_out = FIB_RES_DEV(res);
2796         fl4->flowi4_oif = dev_out->ifindex;
2797
2798
2799 make_route:
2800         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2801                                tos, dev_out, flags);
2802         if (!IS_ERR(rth)) {
2803                 unsigned int hash;
2804
2805                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2806                                rt_genid(dev_net(dev_out)));
2807                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2808         }
2809
2810 out:
2811         rcu_read_unlock();
2812         return rth;
2813 }
2814
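     /* Output route lookup: probe the route cache keyed on daddr, saddr,
      * oif, mark and TOS; on a miss, or when caching is disabled, fall back
      * to ip_route_output_slow().
      */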
2815 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2816 {
2817         struct rtable *rth;
2818         unsigned int hash;
2819
2820         if (!rt_caching(net))
2821                 goto slow_output;
2822
2823         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2824
2825         rcu_read_lock_bh();
2826         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2827                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2828                 if (rth->rt_key_dst == flp4->daddr &&
2829                     rth->rt_key_src == flp4->saddr &&
2830                     rt_is_output_route(rth) &&
2831                     rth->rt_oif == flp4->flowi4_oif &&
2832                     rth->rt_mark == flp4->flowi4_mark &&
2833                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2834                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2835                     net_eq(dev_net(rth->dst.dev), net) &&
2836                     !rt_is_expired(rth)) {
2837                         ipv4_validate_peer(rth);
2838                         dst_use(&rth->dst, jiffies);
2839                         RT_CACHE_STAT_INC(out_hit);
2840                         rcu_read_unlock_bh();
2841                         if (!flp4->saddr)
2842                                 flp4->saddr = rth->rt_src;
2843                         if (!flp4->daddr)
2844                                 flp4->daddr = rth->rt_dst;
2845                         return rth;
2846                 }
2847                 RT_CACHE_STAT_INC(out_hlist_search);
2848         }
2849         rcu_read_unlock_bh();
2850
2851 slow_output:
2852         return ip_route_output_slow(net, flp4);
2853 }
2854 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2855
2856 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2857 {
2858         return NULL;
2859 }
2860
2861 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2862 {
2863         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2864
2865         return mtu ? : dst->dev->mtu;
2866 }
2867
2868 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2869 {
2870 }
2871
2872 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2873                                           unsigned long old)
2874 {
2875         return NULL;
2876 }
2877
2878 static struct dst_ops ipv4_dst_blackhole_ops = {
2879         .family                 =       AF_INET,
2880         .protocol               =       cpu_to_be16(ETH_P_IP),
2881         .destroy                =       ipv4_dst_destroy,
2882         .check                  =       ipv4_blackhole_dst_check,
2883         .mtu                    =       ipv4_blackhole_mtu,
2884         .default_advmss         =       ipv4_default_advmss,
2885         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2886         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2887         .neigh_lookup           =       ipv4_neigh_lookup,
2888 };
2889
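     /* Clone @dst_orig into a blackhole route: same keys, metrics and
      * references, but its input and output handlers simply discard
      * packets.  The reference on @dst_orig is released before returning.
      */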
2890 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2891 {
2892         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2893         struct rtable *ort = (struct rtable *) dst_orig;
2894
2895         if (rt) {
2896                 struct dst_entry *new = &rt->dst;
2897
2898                 new->__use = 1;
2899                 new->input = dst_discard;
2900                 new->output = dst_discard;
2901                 dst_copy_metrics(new, &ort->dst);
2902
2903                 new->dev = ort->dst.dev;
2904                 if (new->dev)
2905                         dev_hold(new->dev);
2906
2907                 rt->rt_key_dst = ort->rt_key_dst;
2908                 rt->rt_key_src = ort->rt_key_src;
2909                 rt->rt_key_tos = ort->rt_key_tos;
2910                 rt->rt_route_iif = ort->rt_route_iif;
2911                 rt->rt_iif = ort->rt_iif;
2912                 rt->rt_oif = ort->rt_oif;
2913                 rt->rt_mark = ort->rt_mark;
2914
2915                 rt->rt_genid = rt_genid(net);
2916                 rt->rt_flags = ort->rt_flags;
2917                 rt->rt_type = ort->rt_type;
2918                 rt->rt_dst = ort->rt_dst;
2919                 rt->rt_src = ort->rt_src;
2920                 rt->rt_gateway = ort->rt_gateway;
2921                 rt->rt_spec_dst = ort->rt_spec_dst;
2922                 rt->peer = ort->peer;
2923                 if (rt->peer)
2924                         atomic_inc(&rt->peer->refcnt);
2925                 rt->fi = ort->fi;
2926                 if (rt->fi)
2927                         atomic_inc(&rt->fi->fib_clntref);
2928
2929                 dst_free(new);
2930         }
2931
2932         dst_release(dst_orig);
2933
2934         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2935 }
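/*
 * ipv4_blackhole_route() clones an existing route into a dst that silently
 * drops everything (both ->input and ->output are dst_discard above).  The
 * xfrm code uses it, via its blackhole_route hook, when a lookup cannot be
 * completed yet but the caller must not block.  Illustrative call pattern;
 * the function consumes the reference on dst_orig either way:
 *
 *	struct dst_entry *dst = ipv4_blackhole_route(net, &rt->dst);
 *
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */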
2936
2937 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2938                                     struct sock *sk)
2939 {
2940         struct rtable *rt = __ip_route_output_key(net, flp4);
2941
2942         if (IS_ERR(rt))
2943                 return rt;
2944
2945         if (flp4->flowi4_proto)
2946                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2947                                                    flowi4_to_flowi(flp4),
2948                                                    sk, 0);
2949
2950         return rt;
2951 }
2952 EXPORT_SYMBOL_GPL(ip_route_output_flow);
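/*
 * Typical caller pattern (an illustrative sketch, not a quote from a real
 * protocol; "daddr" is a hypothetical __be32):
 *
 *	struct flowi4 fl4 = {
 *		.daddr        = daddr,
 *		.flowi4_oif   = sk->sk_bound_dev_if,
 *		.flowi4_mark  = sk->sk_mark,
 *		.flowi4_proto = IPPROTO_TCP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 * Because flowi4_proto is set, the result is passed through xfrm_lookup()
 * above, so any matching IPsec policy is applied to the returned route.
 */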
2953
2954 static int rt_fill_info(struct net *net,
2955                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2956                         int nowait, unsigned int flags)
2957 {
2958         struct rtable *rt = skb_rtable(skb);
2959         struct rtmsg *r;
2960         struct nlmsghdr *nlh;
2961         unsigned long expires = 0;
2962         const struct inet_peer *peer = rt->peer;
2963         u32 id = 0, ts = 0, tsage = 0, error;
2964
2965         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2966         if (nlh == NULL)
2967                 return -EMSGSIZE;
2968
2969         r = nlmsg_data(nlh);
2970         r->rtm_family    = AF_INET;
2971         r->rtm_dst_len  = 32;
2972         r->rtm_src_len  = 0;
2973         r->rtm_tos      = rt->rt_key_tos;
2974         r->rtm_table    = RT_TABLE_MAIN;
2975         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2976         r->rtm_type     = rt->rt_type;
2977         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2978         r->rtm_protocol = RTPROT_UNSPEC;
2979         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2980         if (rt->rt_flags & RTCF_NOTIFY)
2981                 r->rtm_flags |= RTM_F_NOTIFY;
2982
2983         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2984
2985         if (rt->rt_key_src) {
2986                 r->rtm_src_len = 32;
2987                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2988         }
2989         if (rt->dst.dev)
2990                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2991 #ifdef CONFIG_IP_ROUTE_CLASSID
2992         if (rt->dst.tclassid)
2993                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2994 #endif
2995         if (rt_is_input_route(rt))
2996                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2997         else if (rt->rt_src != rt->rt_key_src)
2998                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2999
3000         if (rt->rt_dst != rt->rt_gateway)
3001                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3002
3003         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3004                 goto nla_put_failure;
3005
3006         if (rt->rt_mark)
3007                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3008
3009         error = rt->dst.error;
3010         if (peer) {
3011                 inet_peer_refcheck(rt->peer);
3012                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3013                 if (peer->tcp_ts_stamp) {
3014                         ts = peer->tcp_ts;
3015                         tsage = get_seconds() - peer->tcp_ts_stamp;
3016                 }
3017                 expires = ACCESS_ONCE(peer->pmtu_expires);
3018                 if (expires) {
3019                         if (time_before(jiffies, expires))
3020                                 expires -= jiffies;
3021                         else
3022                                 expires = 0;
3023                 }
3024         }
3025
3026         if (rt_is_input_route(rt)) {
3027 #ifdef CONFIG_IP_MROUTE
3028                 __be32 dst = rt->rt_dst;
3029
3030                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3031                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3032                         int err = ipmr_get_route(net, skb,
3033                                                  rt->rt_src, rt->rt_dst,
3034                                                  r, nowait);
3035                         if (err <= 0) {
3036                                 if (!nowait) {
3037                                         if (err == 0)
3038                                                 return 0;
3039                                         goto nla_put_failure;
3040                                 } else {
3041                                         if (err == -EMSGSIZE)
3042                                                 goto nla_put_failure;
3043                                         error = err;
3044                                 }
3045                         }
3046                 } else
3047 #endif
3048                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3049         }
3050
3051         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3052                                expires, error) < 0)
3053                 goto nla_put_failure;
3054
3055         return nlmsg_end(skb, nlh);
3056
3057 nla_put_failure:
3058         nlmsg_cancel(skb, nlh);
3059         return -EMSGSIZE;
3060 }
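/*
 * rt_fill_info() serialises one cached route into an RTM_NEWROUTE message:
 * a fixed struct rtmsg header followed by RTA_DST, RTA_SRC, RTA_OIF or
 * RTA_IIF, RTA_PREFSRC, RTA_GATEWAY, RTA_MARK and the metrics, with the
 * id/timestamp/expiry block appended by rtnl_put_cacheinfo().  This is the
 * layout userspace route tools decode.
 */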
3061
3062 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3063 {
3064         struct net *net = sock_net(in_skb->sk);
3065         struct rtmsg *rtm;
3066         struct nlattr *tb[RTA_MAX+1];
3067         struct rtable *rt = NULL;
3068         __be32 dst = 0;
3069         __be32 src = 0;
3070         u32 iif;
3071         int err;
3072         int mark;
3073         struct sk_buff *skb;
3074
3075         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3076         if (err < 0)
3077                 goto errout;
3078
3079         rtm = nlmsg_data(nlh);
3080
3081         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3082         if (skb == NULL) {
3083                 err = -ENOBUFS;
3084                 goto errout;
3085         }
3086
3087         /* Reserve room for dummy headers; this skb can pass
3088          * through a good chunk of the routing engine.
3089          */
3090         skb_reset_mac_header(skb);
3091         skb_reset_network_header(skb);
3092
3093         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3094         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3095         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3096
3097         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3098         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3099         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3100         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3101
3102         if (iif) {
3103                 struct net_device *dev;
3104
3105                 dev = __dev_get_by_index(net, iif);
3106                 if (dev == NULL) {
3107                         err = -ENODEV;
3108                         goto errout_free;
3109                 }
3110
3111                 skb->protocol   = htons(ETH_P_IP);
3112                 skb->dev        = dev;
3113                 skb->mark       = mark;
3114                 local_bh_disable();
3115                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3116                 local_bh_enable();
3117
3118                 rt = skb_rtable(skb);
3119                 if (err == 0 && rt->dst.error)
3120                         err = -rt->dst.error;
3121         } else {
3122                 struct flowi4 fl4 = {
3123                         .daddr = dst,
3124                         .saddr = src,
3125                         .flowi4_tos = rtm->rtm_tos,
3126                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3127                         .flowi4_mark = mark,
3128                 };
3129                 rt = ip_route_output_key(net, &fl4);
3130
3131                 err = 0;
3132                 if (IS_ERR(rt))
3133                         err = PTR_ERR(rt);
3134         }
3135
3136         if (err)
3137                 goto errout_free;
3138
3139         skb_dst_set(skb, &rt->dst);
3140         if (rtm->rtm_flags & RTM_F_NOTIFY)
3141                 rt->rt_flags |= RTCF_NOTIFY;
3142
3143         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3144                            RTM_NEWROUTE, 0, 0);
3145         if (err <= 0)
3146                 goto errout_free;
3147
3148         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3149 errout:
3150         return err;
3151
3152 errout_free:
3153         kfree_skb(skb);
3154         goto errout;
3155 }
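/*
 * inet_rtm_getroute() answers RTM_GETROUTE requests, e.g. (illustrative)
 * what "ip route get 192.0.2.1" sends.  With RTA_IIF present the request
 * is resolved as if a packet had arrived on that interface (ip_route_input
 * above); otherwise a normal output lookup is done with
 * ip_route_output_key(), and the result comes back as a single
 * RTM_NEWROUTE message built by rt_fill_info().
 */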
3156
3157 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3158 {
3159         struct rtable *rt;
3160         int h, s_h;
3161         int idx, s_idx;
3162         struct net *net;
3163
3164         net = sock_net(skb->sk);
3165
3166         s_h = cb->args[0];
3167         if (s_h < 0)
3168                 s_h = 0;
3169         s_idx = idx = cb->args[1];
3170         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3171                 if (!rt_hash_table[h].chain)
3172                         continue;
3173                 rcu_read_lock_bh();
3174                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3175                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3176                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3177                                 continue;
3178                         if (rt_is_expired(rt))
3179                                 continue;
3180                         skb_dst_set_noref(skb, &rt->dst);
3181                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3182                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3183                                          1, NLM_F_MULTI) <= 0) {
3184                                 skb_dst_drop(skb);
3185                                 rcu_read_unlock_bh();
3186                                 goto done;
3187                         }
3188                         skb_dst_drop(skb);
3189                 }
3190                 rcu_read_unlock_bh();
3191         }
3192
3193 done:
3194         cb->args[0] = h;
3195         cb->args[1] = idx;
3196         return skb->len;
3197 }
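/*
 * ip_rt_dump() walks the route cache hash chains and emits one
 * RTM_NEWROUTE message per live, non-expired entry, resuming from
 * cb->args[0]/cb->args[1] so a large cache can be spread across several
 * netlink skbs.
 */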
3198
3199 void ip_rt_multicast_event(struct in_device *in_dev)
3200 {
3201         rt_cache_flush(dev_net(in_dev->dev), 0);
3202 }
3203
3204 #ifdef CONFIG_SYSCTL
3205 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3206                                         void __user *buffer,
3207                                         size_t *lenp, loff_t *ppos)
3208 {
3209         if (write) {
3210                 int flush_delay;
3211                 ctl_table ctl;
3212                 struct net *net;
3213
3214                 memcpy(&ctl, __ctl, sizeof(ctl));
3215                 ctl.data = &flush_delay;
3216                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3217
3218                 net = (struct net *)__ctl->extra1;
3219                 rt_cache_flush(net, flush_delay);
3220                 return 0;
3221         }
3222
3223         return -EINVAL;
3224 }
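/*
 * The "flush" sysctl is write-only: writing an integer invalidates the
 * route cache of the owning namespace, e.g. (illustrative)
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * The handler copies the ctl_table entry so proc_dointvec() parses into a
 * local flush_delay rather than a shared variable, then calls
 * rt_cache_flush() with the per-net pointer stashed in ->extra1.
 */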
3225
3226 static ctl_table ipv4_route_table[] = {
3227         {
3228                 .procname       = "gc_thresh",
3229                 .data           = &ipv4_dst_ops.gc_thresh,
3230                 .maxlen         = sizeof(int),
3231                 .mode           = 0644,
3232                 .proc_handler   = proc_dointvec,
3233         },
3234         {
3235                 .procname       = "max_size",
3236                 .data           = &ip_rt_max_size,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec,
3240         },
3241         {
3242                 /*  Deprecated. Use gc_min_interval_ms */
3243
3244                 .procname       = "gc_min_interval",
3245                 .data           = &ip_rt_gc_min_interval,
3246                 .maxlen         = sizeof(int),
3247                 .mode           = 0644,
3248                 .proc_handler   = proc_dointvec_jiffies,
3249         },
3250         {
3251                 .procname       = "gc_min_interval_ms",
3252                 .data           = &ip_rt_gc_min_interval,
3253                 .maxlen         = sizeof(int),
3254                 .mode           = 0644,
3255                 .proc_handler   = proc_dointvec_ms_jiffies,
3256         },
3257         {
3258                 .procname       = "gc_timeout",
3259                 .data           = &ip_rt_gc_timeout,
3260                 .maxlen         = sizeof(int),
3261                 .mode           = 0644,
3262                 .proc_handler   = proc_dointvec_jiffies,
3263         },
3264         {
3265                 .procname       = "gc_interval",
3266                 .data           = &ip_rt_gc_interval,
3267                 .maxlen         = sizeof(int),
3268                 .mode           = 0644,
3269                 .proc_handler   = proc_dointvec_jiffies,
3270         },
3271         {
3272                 .procname       = "redirect_load",
3273                 .data           = &ip_rt_redirect_load,
3274                 .maxlen         = sizeof(int),
3275                 .mode           = 0644,
3276                 .proc_handler   = proc_dointvec,
3277         },
3278         {
3279                 .procname       = "redirect_number",
3280                 .data           = &ip_rt_redirect_number,
3281                 .maxlen         = sizeof(int),
3282                 .mode           = 0644,
3283                 .proc_handler   = proc_dointvec,
3284         },
3285         {
3286                 .procname       = "redirect_silence",
3287                 .data           = &ip_rt_redirect_silence,
3288                 .maxlen         = sizeof(int),
3289                 .mode           = 0644,
3290                 .proc_handler   = proc_dointvec,
3291         },
3292         {
3293                 .procname       = "error_cost",
3294                 .data           = &ip_rt_error_cost,
3295                 .maxlen         = sizeof(int),
3296                 .mode           = 0644,
3297                 .proc_handler   = proc_dointvec,
3298         },
3299         {
3300                 .procname       = "error_burst",
3301                 .data           = &ip_rt_error_burst,
3302                 .maxlen         = sizeof(int),
3303                 .mode           = 0644,
3304                 .proc_handler   = proc_dointvec,
3305         },
3306         {
3307                 .procname       = "gc_elasticity",
3308                 .data           = &ip_rt_gc_elasticity,
3309                 .maxlen         = sizeof(int),
3310                 .mode           = 0644,
3311                 .proc_handler   = proc_dointvec,
3312         },
3313         {
3314                 .procname       = "mtu_expires",
3315                 .data           = &ip_rt_mtu_expires,
3316                 .maxlen         = sizeof(int),
3317                 .mode           = 0644,
3318                 .proc_handler   = proc_dointvec_jiffies,
3319         },
3320         {
3321                 .procname       = "min_pmtu",
3322                 .data           = &ip_rt_min_pmtu,
3323                 .maxlen         = sizeof(int),
3324                 .mode           = 0644,
3325                 .proc_handler   = proc_dointvec,
3326         },
3327         {
3328                 .procname       = "min_adv_mss",
3329                 .data           = &ip_rt_min_advmss,
3330                 .maxlen         = sizeof(int),
3331                 .mode           = 0644,
3332                 .proc_handler   = proc_dointvec,
3333         },
3334         { }
3335 };
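/*
 * The table above is exposed under /proc/sys/net/ipv4/route/ (see the
 * ipv4_path/ipv4_skeleton registration below).  Entries using the
 * *_jiffies handlers are stored internally in jiffies but read and written
 * as seconds, or as milliseconds in the case of gc_min_interval_ms.
 */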
3336
3337 static struct ctl_table empty[1];
3338
3339 static struct ctl_table ipv4_skeleton[] =
3340 {
3341         { .procname = "route",
3342           .mode = 0555, .child = ipv4_route_table },
3343         { .procname = "neigh",
3344           .mode = 0555, .child = empty },
3345         { }
3346 };
3347
3348 static __net_initdata struct ctl_path ipv4_path[] = {
3349         { .procname = "net", },
3350         { .procname = "ipv4", },
3351         { },
3352 };
3353
3354 static struct ctl_table ipv4_route_flush_table[] = {
3355         {
3356                 .procname       = "flush",
3357                 .maxlen         = sizeof(int),
3358                 .mode           = 0200,
3359                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3360         },
3361         { },
3362 };
3363
3364 static __net_initdata struct ctl_path ipv4_route_path[] = {
3365         { .procname = "net", },
3366         { .procname = "ipv4", },
3367         { .procname = "route", },
3368         { },
3369 };
3370
3371 static __net_init int sysctl_route_net_init(struct net *net)
3372 {
3373         struct ctl_table *tbl;
3374
3375         tbl = ipv4_route_flush_table;
3376         if (!net_eq(net, &init_net)) {
3377                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3378                 if (tbl == NULL)
3379                         goto err_dup;
3380         }
3381         tbl[0].extra1 = net;
3382
3383         net->ipv4.route_hdr =
3384                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3385         if (net->ipv4.route_hdr == NULL)
3386                 goto err_reg;
3387         return 0;
3388
3389 err_reg:
3390         if (tbl != ipv4_route_flush_table)
3391                 kfree(tbl);
3392 err_dup:
3393         return -ENOMEM;
3394 }
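/*
 * Per-namespace setup: init_net registers the static
 * ipv4_route_flush_table directly, while every other namespace gets a
 * kmemdup()'d copy so that ->extra1 can point at its own struct net.
 * sysctl_route_net_exit() below unregisters and frees only the copies.
 */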
3395
3396 static __net_exit void sysctl_route_net_exit(struct net *net)
3397 {
3398         struct ctl_table *tbl;
3399
3400         tbl = net->ipv4.route_hdr->ctl_table_arg;
3401         unregister_net_sysctl_table(net->ipv4.route_hdr);
3402         BUG_ON(tbl == ipv4_route_flush_table);
3403         kfree(tbl);
3404 }
3405
3406 static __net_initdata struct pernet_operations sysctl_route_ops = {
3407         .init = sysctl_route_net_init,
3408         .exit = sysctl_route_net_exit,
3409 };
3410 #endif
3411
3412 static __net_init int rt_genid_init(struct net *net)
3413 {
3414         get_random_bytes(&net->ipv4.rt_genid,
3415                          sizeof(net->ipv4.rt_genid));
3416         get_random_bytes(&net->ipv4.dev_addr_genid,
3417                          sizeof(net->ipv4.dev_addr_genid));
3418         return 0;
3419 }
3420
3421 static __net_initdata struct pernet_operations rt_genid_ops = {
3422         .init = rt_genid_init,
3423 };
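/*
 * rt_genid is the per-namespace generation counter tested by
 * rt_is_expired(); rt_cache_flush() bumps it, which invalidates every
 * cached entry at once without walking the hash table.  Seeding it (and
 * dev_addr_genid) with random bytes at namespace creation keeps the
 * starting values unpredictable.
 */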
3424
3425
3426 #ifdef CONFIG_IP_ROUTE_CLASSID
3427 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3428 #endif /* CONFIG_IP_ROUTE_CLASSID */
3429
3430 static __initdata unsigned long rhash_entries;
3431 static int __init set_rhash_entries(char *str)
3432 {
3433         if (!str)
3434                 return 0;
3435         rhash_entries = simple_strtoul(str, &str, 0);
3436         return 1;
3437 }
3438 __setup("rhash_entries=", set_rhash_entries);
3439
3440 int __init ip_rt_init(void)
3441 {
3442         int rc = 0;
3443
3444 #ifdef CONFIG_IP_ROUTE_CLASSID
3445         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3446         if (!ip_rt_acct)
3447                 panic("IP: failed to allocate ip_rt_acct\n");
3448 #endif
3449
3450         ipv4_dst_ops.kmem_cachep =
3451                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3452                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3453
3454         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3455
3456         if (dst_entries_init(&ipv4_dst_ops) < 0)
3457                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3458
3459         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3460                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3461
3462         rt_hash_table = (struct rt_hash_bucket *)
3463                 alloc_large_system_hash("IP route cache",
3464                                         sizeof(struct rt_hash_bucket),
3465                                         rhash_entries,
3466                                         (totalram_pages >= 128 * 1024) ?
3467                                         15 : 17,
3468                                         0,
3469                                         &rt_hash_log,
3470                                         &rt_hash_mask,
3471                                         rhash_entries ? 0 : 512 * 1024);
3472         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3473         rt_hash_lock_init();
3474
3475         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3476         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3477
3478         devinet_init();
3479         ip_fib_init();
3480
3481         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3482         expires_ljiffies = jiffies;
3483         schedule_delayed_work(&expires_work,
3484                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3485
3486         if (ip_rt_proc_init())
3487                 pr_err("Unable to create route proc files\n");
3488 #ifdef CONFIG_XFRM
3489         xfrm_init();
3490         xfrm4_init(ip_rt_max_size);
3491 #endif
3492         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3493
3494 #ifdef CONFIG_SYSCTL
3495         register_pernet_subsys(&sysctl_route_ops);
3496 #endif
3497         register_pernet_subsys(&rt_genid_ops);
3498         return rc;
3499 }
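/*
 * Note on the sizing above: the GC threshold is set to the number of hash
 * buckets (rt_hash_mask + 1) and the hard cache limit to an average of 16
 * entries per bucket, so both scale with whatever table size
 * alloc_large_system_hash() (or rhash_entries=) selected.
 */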
3500
3501 #ifdef CONFIG_SYSCTL
3502 /*
3503  * We really need to sanitize the damn ipv4 init order, then all
3504  * this nonsense will go away.
3505  */
3506 void __init ip_static_sysctl_init(void)
3507 {
3508         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3509 }
3510 #endif