route: Remove redirect_genid
net/ipv4/route.c (linux-flexiantxendom0-3.2.10.git)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116
117 #define IP_MAX_MTU      0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly  = 9;
126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly       = HZ;
129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly    = 8;
131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly       = 256;
134 static int rt_chain_length_max __read_mostly    = 20;
135
136 static struct delayed_work expires_work;
137 static unsigned long expires_ljiffies;
138
139 /*
140  *      Interface to generic destination cache.
141  */
142
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
145 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void              ipv4_link_failure(struct sk_buff *skb);
149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
151
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153                             int how)
154 {
155 }
156
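/*
 * Copy-on-write for dst metrics: bind an inet_peer to the route if needed
 * and switch dst->_metrics from the old (possibly read-only) array to the
 * peer's writable copy.  Returns a pointer to writable metrics, or NULL if
 * no peer could be bound or another CPU installed a read-only array first.
 */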
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159         struct rtable *rt = (struct rtable *) dst;
160         struct inet_peer *peer;
161         u32 *p = NULL;
162
163         if (!rt->peer)
164                 rt_bind_peer(rt, rt->rt_dst, 1);
165
166         peer = rt->peer;
167         if (peer) {
168                 u32 *old_p = __DST_METRICS_PTR(old);
169                 unsigned long prev, new;
170
171                 p = peer->metrics;
172                 if (inet_metrics_new(peer))
173                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
174
175                 new = (unsigned long) p;
176                 prev = cmpxchg(&dst->_metrics, old, new);
177
178                 if (prev != old) {
179                         p = __DST_METRICS_PTR(prev);
180                         if (prev & DST_METRICS_READ_ONLY)
181                                 p = NULL;
182                 } else {
183                         if (rt->fi) {
184                                 fib_info_put(rt->fi);
185                                 rt->fi = NULL;
186                         }
187                 }
188         }
189         return p;
190 }
191
192 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
193
194 static struct dst_ops ipv4_dst_ops = {
195         .family =               AF_INET,
196         .protocol =             cpu_to_be16(ETH_P_IP),
197         .gc =                   rt_garbage_collect,
198         .check =                ipv4_dst_check,
199         .default_advmss =       ipv4_default_advmss,
200         .mtu =                  ipv4_mtu,
201         .cow_metrics =          ipv4_cow_metrics,
202         .destroy =              ipv4_dst_destroy,
203         .ifdown =               ipv4_dst_ifdown,
204         .negative_advice =      ipv4_negative_advice,
205         .link_failure =         ipv4_link_failure,
206         .update_pmtu =          ip_rt_update_pmtu,
207         .local_out =            __ip_local_out,
208         .neigh_lookup =         ipv4_neigh_lookup,
209 };
210
211 #define ECN_OR_COST(class)      TC_PRIO_##class
212
213 const __u8 ip_tos2prio[16] = {
214         TC_PRIO_BESTEFFORT,
215         ECN_OR_COST(BESTEFFORT),
216         TC_PRIO_BESTEFFORT,
217         ECN_OR_COST(BESTEFFORT),
218         TC_PRIO_BULK,
219         ECN_OR_COST(BULK),
220         TC_PRIO_BULK,
221         ECN_OR_COST(BULK),
222         TC_PRIO_INTERACTIVE,
223         ECN_OR_COST(INTERACTIVE),
224         TC_PRIO_INTERACTIVE,
225         ECN_OR_COST(INTERACTIVE),
226         TC_PRIO_INTERACTIVE_BULK,
227         ECN_OR_COST(INTERACTIVE_BULK),
228         TC_PRIO_INTERACTIVE_BULK,
229         ECN_OR_COST(INTERACTIVE_BULK)
230 };
231
232
233 /*
234  * Route cache.
235  */
236
237 /* The locking scheme is rather straightforward:
238  *
239  * 1) Read-Copy Update protects the buckets of the central route hash.
240  * 2) Only writers remove entries, and they hold the lock
241  *    as they look at rtable reference counts.
242  * 3) Only readers acquire references to rtable entries,
243  *    they do so with atomic increments and with the
244  *    lock held.
245  */
246
247 struct rt_hash_bucket {
248         struct rtable __rcu     *chain;
249 };
250
251 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
252         defined(CONFIG_PROVE_LOCKING)
253 /*
254  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
255  * The size of this table is a power of two and depends on the number of CPUs.
256  * (With lockdep, spinlock_t is quite big, so keep the size down there.)
257  */
258 #ifdef CONFIG_LOCKDEP
259 # define RT_HASH_LOCK_SZ        256
260 #else
261 # if NR_CPUS >= 32
262 #  define RT_HASH_LOCK_SZ       4096
263 # elif NR_CPUS >= 16
264 #  define RT_HASH_LOCK_SZ       2048
265 # elif NR_CPUS >= 8
266 #  define RT_HASH_LOCK_SZ       1024
267 # elif NR_CPUS >= 4
268 #  define RT_HASH_LOCK_SZ       512
269 # else
270 #  define RT_HASH_LOCK_SZ       256
271 # endif
272 #endif
273
274 static spinlock_t       *rt_hash_locks;
275 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
276
277 static __init void rt_hash_lock_init(void)
278 {
279         int i;
280
281         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
282                         GFP_KERNEL);
283         if (!rt_hash_locks)
284                 panic("IP: failed to allocate rt_hash_locks\n");
285
286         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
287                 spin_lock_init(&rt_hash_locks[i]);
288 }
289 #else
290 # define rt_hash_lock_addr(slot) NULL
291
292 static inline void rt_hash_lock_init(void)
293 {
294 }
295 #endif
296
297 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
298 static unsigned                 rt_hash_mask __read_mostly;
299 static unsigned int             rt_hash_log  __read_mostly;
300
301 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
302 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
303
304 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
305                                    int genid)
306 {
307         return jhash_3words((__force u32)daddr, (__force u32)saddr,
308                             idx, genid)
309                 & rt_hash_mask;
310 }
311
312 static inline int rt_genid(struct net *net)
313 {
314         return atomic_read(&net->ipv4.rt_genid);
315 }
316
317 #ifdef CONFIG_PROC_FS
318 struct rt_cache_iter_state {
319         struct seq_net_private p;
320         int bucket;
321         int genid;
322 };
323
324 static struct rtable *rt_cache_get_first(struct seq_file *seq)
325 {
326         struct rt_cache_iter_state *st = seq->private;
327         struct rtable *r = NULL;
328
329         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
330                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
331                         continue;
332                 rcu_read_lock_bh();
333                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
334                 while (r) {
335                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
336                             r->rt_genid == st->genid)
337                                 return r;
338                         r = rcu_dereference_bh(r->dst.rt_next);
339                 }
340                 rcu_read_unlock_bh();
341         }
342         return r;
343 }
344
345 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
346                                           struct rtable *r)
347 {
348         struct rt_cache_iter_state *st = seq->private;
349
350         r = rcu_dereference_bh(r->dst.rt_next);
351         while (!r) {
352                 rcu_read_unlock_bh();
353                 do {
354                         if (--st->bucket < 0)
355                                 return NULL;
356                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
357                 rcu_read_lock_bh();
358                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
359         }
360         return r;
361 }
362
363 static struct rtable *rt_cache_get_next(struct seq_file *seq,
364                                         struct rtable *r)
365 {
366         struct rt_cache_iter_state *st = seq->private;
367         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
368                 if (dev_net(r->dst.dev) != seq_file_net(seq))
369                         continue;
370                 if (r->rt_genid == st->genid)
371                         break;
372         }
373         return r;
374 }
375
376 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
377 {
378         struct rtable *r = rt_cache_get_first(seq);
379
380         if (r)
381                 while (pos && (r = rt_cache_get_next(seq, r)))
382                         --pos;
383         return pos ? NULL : r;
384 }
385
386 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
387 {
388         struct rt_cache_iter_state *st = seq->private;
389         if (*pos)
390                 return rt_cache_get_idx(seq, *pos - 1);
391         st->genid = rt_genid(seq_file_net(seq));
392         return SEQ_START_TOKEN;
393 }
394
395 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
396 {
397         struct rtable *r;
398
399         if (v == SEQ_START_TOKEN)
400                 r = rt_cache_get_first(seq);
401         else
402                 r = rt_cache_get_next(seq, v);
403         ++*pos;
404         return r;
405 }
406
407 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
408 {
409         if (v && v != SEQ_START_TOKEN)
410                 rcu_read_unlock_bh();
411 }
412
413 static int rt_cache_seq_show(struct seq_file *seq, void *v)
414 {
415         if (v == SEQ_START_TOKEN)
416                 seq_printf(seq, "%-127s\n",
417                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
418                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
419                            "HHUptod\tSpecDst");
420         else {
421                 struct rtable *r = v;
422                 struct neighbour *n;
423                 int len, HHUptod;
424
425                 rcu_read_lock();
426                 n = dst_get_neighbour_noref(&r->dst);
427                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
428                 rcu_read_unlock();
429
430                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
431                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
432                         r->dst.dev ? r->dst.dev->name : "*",
433                         (__force u32)r->rt_dst,
434                         (__force u32)r->rt_gateway,
435                         r->rt_flags, atomic_read(&r->dst.__refcnt),
436                         r->dst.__use, 0, (__force u32)r->rt_src,
437                         dst_metric_advmss(&r->dst) + 40,
438                         dst_metric(&r->dst, RTAX_WINDOW),
439                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
440                               dst_metric(&r->dst, RTAX_RTTVAR)),
441                         r->rt_key_tos,
442                         -1,
443                         HHUptod,
444                         r->rt_spec_dst, &len);
445
446                 seq_printf(seq, "%*s\n", 127 - len, "");
447         }
448         return 0;
449 }
450
451 static const struct seq_operations rt_cache_seq_ops = {
452         .start  = rt_cache_seq_start,
453         .next   = rt_cache_seq_next,
454         .stop   = rt_cache_seq_stop,
455         .show   = rt_cache_seq_show,
456 };
457
458 static int rt_cache_seq_open(struct inode *inode, struct file *file)
459 {
460         return seq_open_net(inode, file, &rt_cache_seq_ops,
461                         sizeof(struct rt_cache_iter_state));
462 }
463
464 static const struct file_operations rt_cache_seq_fops = {
465         .owner   = THIS_MODULE,
466         .open    = rt_cache_seq_open,
467         .read    = seq_read,
468         .llseek  = seq_lseek,
469         .release = seq_release_net,
470 };
471
472
473 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
474 {
475         int cpu;
476
477         if (*pos == 0)
478                 return SEQ_START_TOKEN;
479
480         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
481                 if (!cpu_possible(cpu))
482                         continue;
483                 *pos = cpu+1;
484                 return &per_cpu(rt_cache_stat, cpu);
485         }
486         return NULL;
487 }
488
489 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
490 {
491         int cpu;
492
493         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
494                 if (!cpu_possible(cpu))
495                         continue;
496                 *pos = cpu+1;
497                 return &per_cpu(rt_cache_stat, cpu);
498         }
499         return NULL;
500
501 }
502
503 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
504 {
505
506 }
507
508 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
509 {
510         struct rt_cache_stat *st = v;
511
512         if (v == SEQ_START_TOKEN) {
513                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
514                 return 0;
515         }
516
517         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
518                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
519                    dst_entries_get_slow(&ipv4_dst_ops),
520                    st->in_hit,
521                    st->in_slow_tot,
522                    st->in_slow_mc,
523                    st->in_no_route,
524                    st->in_brd,
525                    st->in_martian_dst,
526                    st->in_martian_src,
527
528                    st->out_hit,
529                    st->out_slow_tot,
530                    st->out_slow_mc,
531
532                    st->gc_total,
533                    st->gc_ignored,
534                    st->gc_goal_miss,
535                    st->gc_dst_overflow,
536                    st->in_hlist_search,
537                    st->out_hlist_search
538                 );
539         return 0;
540 }
541
542 static const struct seq_operations rt_cpu_seq_ops = {
543         .start  = rt_cpu_seq_start,
544         .next   = rt_cpu_seq_next,
545         .stop   = rt_cpu_seq_stop,
546         .show   = rt_cpu_seq_show,
547 };
548
549
550 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
551 {
552         return seq_open(file, &rt_cpu_seq_ops);
553 }
554
555 static const struct file_operations rt_cpu_seq_fops = {
556         .owner   = THIS_MODULE,
557         .open    = rt_cpu_seq_open,
558         .read    = seq_read,
559         .llseek  = seq_lseek,
560         .release = seq_release,
561 };
562
563 #ifdef CONFIG_IP_ROUTE_CLASSID
564 static int rt_acct_proc_show(struct seq_file *m, void *v)
565 {
566         struct ip_rt_acct *dst, *src;
567         unsigned int i, j;
568
569         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
570         if (!dst)
571                 return -ENOMEM;
572
573         for_each_possible_cpu(i) {
574                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
575                 for (j = 0; j < 256; j++) {
576                         dst[j].o_bytes   += src[j].o_bytes;
577                         dst[j].o_packets += src[j].o_packets;
578                         dst[j].i_bytes   += src[j].i_bytes;
579                         dst[j].i_packets += src[j].i_packets;
580                 }
581         }
582
583         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
584         kfree(dst);
585         return 0;
586 }
587
588 static int rt_acct_proc_open(struct inode *inode, struct file *file)
589 {
590         return single_open(file, rt_acct_proc_show, NULL);
591 }
592
593 static const struct file_operations rt_acct_proc_fops = {
594         .owner          = THIS_MODULE,
595         .open           = rt_acct_proc_open,
596         .read           = seq_read,
597         .llseek         = seq_lseek,
598         .release        = single_release,
599 };
600 #endif
601
602 static int __net_init ip_rt_do_proc_init(struct net *net)
603 {
604         struct proc_dir_entry *pde;
605
606         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
607                         &rt_cache_seq_fops);
608         if (!pde)
609                 goto err1;
610
611         pde = proc_create("rt_cache", S_IRUGO,
612                           net->proc_net_stat, &rt_cpu_seq_fops);
613         if (!pde)
614                 goto err2;
615
616 #ifdef CONFIG_IP_ROUTE_CLASSID
617         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
618         if (!pde)
619                 goto err3;
620 #endif
621         return 0;
622
623 #ifdef CONFIG_IP_ROUTE_CLASSID
624 err3:
625         remove_proc_entry("rt_cache", net->proc_net_stat);
626 #endif
627 err2:
628         remove_proc_entry("rt_cache", net->proc_net);
629 err1:
630         return -ENOMEM;
631 }
632
633 static void __net_exit ip_rt_do_proc_exit(struct net *net)
634 {
635         remove_proc_entry("rt_cache", net->proc_net_stat);
636         remove_proc_entry("rt_cache", net->proc_net);
637 #ifdef CONFIG_IP_ROUTE_CLASSID
638         remove_proc_entry("rt_acct", net->proc_net);
639 #endif
640 }
641
642 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
643         .init = ip_rt_do_proc_init,
644         .exit = ip_rt_do_proc_exit,
645 };
646
647 static int __init ip_rt_proc_init(void)
648 {
649         return register_pernet_subsys(&ip_rt_proc_ops);
650 }
651
652 #else
653 static inline int ip_rt_proc_init(void)
654 {
655         return 0;
656 }
657 #endif /* CONFIG_PROC_FS */
658
659 static inline void rt_free(struct rtable *rt)
660 {
661         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
662 }
663
664 static inline void rt_drop(struct rtable *rt)
665 {
666         ip_rt_put(rt);
667         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
668 }
669
670 static inline int rt_fast_clean(struct rtable *rth)
671 {
672         /* Kill broadcast/multicast entries very aggressively, if they
673            collide in the hash table with more useful entries */
674         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
675                 rt_is_input_route(rth) && rth->dst.rt_next;
676 }
677
678 static inline int rt_valuable(struct rtable *rth)
679 {
680         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
681                 (rth->peer && rth->peer->pmtu_expires);
682 }
683
684 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
685 {
686         unsigned long age;
687         int ret = 0;
688
689         if (atomic_read(&rth->dst.__refcnt))
690                 goto out;
691
692         age = jiffies - rth->dst.lastuse;
693         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
694             (age <= tmo2 && rt_valuable(rth)))
695                 goto out;
696         ret = 1;
697 out:    return ret;
698 }
699
700 /* Bits of score are:
701  * 31: very valuable
702  * 30: not quite useless
703  * 29..0: usage counter
704  */
705 static inline u32 rt_score(struct rtable *rt)
706 {
707         u32 score = jiffies - rt->dst.lastuse;
708
709         score = ~score & ~(3<<30);
710
711         if (rt_valuable(rt))
712                 score |= (1<<31);
713
714         if (rt_is_output_route(rt) ||
715             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
716                 score |= (1<<30);
717
718         return score;
719 }
720
721 static inline bool rt_caching(const struct net *net)
722 {
723         return net->ipv4.current_rt_cache_rebuild_count <=
724                 net->ipv4.sysctl_rt_cache_rebuild_count;
725 }
726
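/*
 * compare_hash_inputs() checks only the fields that feed rt_hash()
 * (destination, source and input interface), while compare_keys() below
 * matches the full lookup key including mark, TOS and output interface.
 */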
727 static inline bool compare_hash_inputs(const struct rtable *rt1,
728                                        const struct rtable *rt2)
729 {
730         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
731                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
732                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
733 }
734
735 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
736 {
737         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
738                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
739                 (rt1->rt_mark ^ rt2->rt_mark) |
740                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
741                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
742                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
743 }
744
745 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
746 {
747         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
748 }
749
750 static inline int rt_is_expired(struct rtable *rth)
751 {
752         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
753 }
754
755 /*
756  * Perform a full scan of the hash table and free all entries.
757  * Can be called from softirq or process context.
758  * In the latter case, we want to reschedule if necessary.
759  */
760 static void rt_do_flush(struct net *net, int process_context)
761 {
762         unsigned int i;
763         struct rtable *rth, *next;
764
765         for (i = 0; i <= rt_hash_mask; i++) {
766                 struct rtable __rcu **pprev;
767                 struct rtable *list;
768
769                 if (process_context && need_resched())
770                         cond_resched();
771                 rth = rcu_access_pointer(rt_hash_table[i].chain);
772                 if (!rth)
773                         continue;
774
775                 spin_lock_bh(rt_hash_lock_addr(i));
776
777                 list = NULL;
778                 pprev = &rt_hash_table[i].chain;
779                 rth = rcu_dereference_protected(*pprev,
780                         lockdep_is_held(rt_hash_lock_addr(i)));
781
782                 while (rth) {
783                         next = rcu_dereference_protected(rth->dst.rt_next,
784                                 lockdep_is_held(rt_hash_lock_addr(i)));
785
786                         if (!net ||
787                             net_eq(dev_net(rth->dst.dev), net)) {
788                                 rcu_assign_pointer(*pprev, next);
789                                 rcu_assign_pointer(rth->dst.rt_next, list);
790                                 list = rth;
791                         } else {
792                                 pprev = &rth->dst.rt_next;
793                         }
794                         rth = next;
795                 }
796
797                 spin_unlock_bh(rt_hash_lock_addr(i));
798
799                 for (; list; list = next) {
800                         next = rcu_dereference_protected(list->dst.rt_next, 1);
801                         rt_free(list);
802                 }
803         }
804 }
805
806 /*
807  * While freeing expired entries, we compute average chain length
808  * and standard deviation, using fixed-point arithmetic.
809  * This gives an estimate of rt_chain_length_max:
810  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
811  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
812  */
813
814 #define FRACT_BITS 3
815 #define ONE (1UL << FRACT_BITS)
816
817 /*
818  * Given a hash chain and an item in this hash chain,
819  * find whether a previous entry has the same hash_inputs
820  * (but differs on tos, mark or oif).
821  * Returns 0 if an alias is found.
822  * Returns ONE if rth has no alias before itself.
823  */
824 static int has_noalias(const struct rtable *head, const struct rtable *rth)
825 {
826         const struct rtable *aux = head;
827
828         while (aux != rth) {
829                 if (compare_hash_inputs(aux, rth))
830                         return 0;
831                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
832         }
833         return ONE;
834 }
835
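/*
 * Scan a slice of the hash table (sized by the time elapsed since the last
 * run) and free expired entries, gathering chain length statistics used to
 * update rt_chain_length_max.
 */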
836 static void rt_check_expire(void)
837 {
838         static unsigned int rover;
839         unsigned int i = rover, goal;
840         struct rtable *rth;
841         struct rtable __rcu **rthp;
842         unsigned long samples = 0;
843         unsigned long sum = 0, sum2 = 0;
844         unsigned long delta;
845         u64 mult;
846
847         delta = jiffies - expires_ljiffies;
848         expires_ljiffies = jiffies;
849         mult = ((u64)delta) << rt_hash_log;
850         if (ip_rt_gc_timeout > 1)
851                 do_div(mult, ip_rt_gc_timeout);
852         goal = (unsigned int)mult;
853         if (goal > rt_hash_mask)
854                 goal = rt_hash_mask + 1;
855         for (; goal > 0; goal--) {
856                 unsigned long tmo = ip_rt_gc_timeout;
857                 unsigned long length;
858
859                 i = (i + 1) & rt_hash_mask;
860                 rthp = &rt_hash_table[i].chain;
861
862                 if (need_resched())
863                         cond_resched();
864
865                 samples++;
866
867                 if (rcu_dereference_raw(*rthp) == NULL)
868                         continue;
869                 length = 0;
870                 spin_lock_bh(rt_hash_lock_addr(i));
871                 while ((rth = rcu_dereference_protected(*rthp,
872                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
873                         prefetch(rth->dst.rt_next);
874                         if (rt_is_expired(rth)) {
875                                 *rthp = rth->dst.rt_next;
876                                 rt_free(rth);
877                                 continue;
878                         }
879                         if (rth->dst.expires) {
880                                 /* Entry is expired even if it is in use */
881                                 if (time_before_eq(jiffies, rth->dst.expires)) {
882 nofree:
883                                         tmo >>= 1;
884                                         rthp = &rth->dst.rt_next;
885                                         /*
886                                          * We only count entries on
887                                          * a chain with equal hash inputs once,
888                                          * so that entries for different QOS
889                                          * levels and other non-hash-input
890                                          * attributes don't unfairly skew
891                                          * the length computation
892                                          */
893                                         length += has_noalias(rt_hash_table[i].chain, rth);
894                                         continue;
895                                 }
896                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
897                                 goto nofree;
898
899                         /* Cleanup aged off entries. */
900                         *rthp = rth->dst.rt_next;
901                         rt_free(rth);
902                 }
903                 spin_unlock_bh(rt_hash_lock_addr(i));
904                 sum += length;
905                 sum2 += length*length;
906         }
907         if (samples) {
908                 unsigned long avg = sum / samples;
909                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
910                 rt_chain_length_max = max_t(unsigned long,
911                                         ip_rt_gc_elasticity,
912                                         (avg + 4*sd) >> FRACT_BITS);
913         }
914         rover = i;
915 }
916
917 /*
918  * rt_worker_func() is run in process context.
919  * we call rt_check_expire() to scan part of the hash table
920  */
921 static void rt_worker_func(struct work_struct *work)
922 {
923         rt_check_expire();
924         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
925 }
926
927 /*
928  * Perturb rt_genid by a small quantity [1..256].
929  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
930  * many times (2^24) without yielding a recent rt_genid.
931  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
932  */
933 static void rt_cache_invalidate(struct net *net)
934 {
935         unsigned char shuffle;
936
937         get_random_bytes(&shuffle, sizeof(shuffle));
938         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
939         inetpeer_invalidate_tree(AF_INET);
940 }
941
942 /*
943  * delay < 0  : invalidate cache (fast : entries will be deleted later)
944  * delay >= 0 : invalidate & flush cache (can be long)
945  */
946 void rt_cache_flush(struct net *net, int delay)
947 {
948         rt_cache_invalidate(net);
949         if (delay >= 0)
950                 rt_do_flush(net, !in_softirq());
951 }
952
953 /* Flush previously invalidated entries from the cache */
954 void rt_cache_flush_batch(struct net *net)
955 {
956         rt_do_flush(net, !in_softirq());
957 }
958
959 static void rt_emergency_hash_rebuild(struct net *net)
960 {
961         if (net_ratelimit())
962                 printk(KERN_WARNING "Route hash chain too long!\n");
963         rt_cache_invalidate(net);
964 }
965
966 /*
967    Short description of GC goals.
968
969    We want to build an algorithm which keeps the routing cache
970    at some equilibrium point, where the number of aged-off entries
971    is kept approximately equal to the number of newly generated ones.
972
973    The current expiration strength is the variable "expire".
974    We try to adjust it dynamically, so that when networking
975    is idle, expire is large enough to keep enough warm entries,
976    and when load increases, it is reduced to limit the cache size.
977  */
978
979 static int rt_garbage_collect(struct dst_ops *ops)
980 {
981         static unsigned long expire = RT_GC_TIMEOUT;
982         static unsigned long last_gc;
983         static int rover;
984         static int equilibrium;
985         struct rtable *rth;
986         struct rtable __rcu **rthp;
987         unsigned long now = jiffies;
988         int goal;
989         int entries = dst_entries_get_fast(&ipv4_dst_ops);
990
991         /*
992          * Garbage collection is pretty expensive,
993          * do not make it too frequently.
994          */
995
996         RT_CACHE_STAT_INC(gc_total);
997
998         if (now - last_gc < ip_rt_gc_min_interval &&
999             entries < ip_rt_max_size) {
1000                 RT_CACHE_STAT_INC(gc_ignored);
1001                 goto out;
1002         }
1003
1004         entries = dst_entries_get_slow(&ipv4_dst_ops);
1005         /* Calculate the number of entries which we want to expire now. */
1006         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1007         if (goal <= 0) {
1008                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1009                         equilibrium = ipv4_dst_ops.gc_thresh;
1010                 goal = entries - equilibrium;
1011                 if (goal > 0) {
1012                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1013                         goal = entries - equilibrium;
1014                 }
1015         } else {
1016                 /* We are in a dangerous area. Try to reduce the cache really
1017                  * aggressively.
1018                  */
1019                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1020                 equilibrium = entries - goal;
1021         }
1022
1023         if (now - last_gc >= ip_rt_gc_min_interval)
1024                 last_gc = now;
1025
1026         if (goal <= 0) {
1027                 equilibrium += goal;
1028                 goto work_done;
1029         }
1030
1031         do {
1032                 int i, k;
1033
1034                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1035                         unsigned long tmo = expire;
1036
1037                         k = (k + 1) & rt_hash_mask;
1038                         rthp = &rt_hash_table[k].chain;
1039                         spin_lock_bh(rt_hash_lock_addr(k));
1040                         while ((rth = rcu_dereference_protected(*rthp,
1041                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1042                                 if (!rt_is_expired(rth) &&
1043                                         !rt_may_expire(rth, tmo, expire)) {
1044                                         tmo >>= 1;
1045                                         rthp = &rth->dst.rt_next;
1046                                         continue;
1047                                 }
1048                                 *rthp = rth->dst.rt_next;
1049                                 rt_free(rth);
1050                                 goal--;
1051                         }
1052                         spin_unlock_bh(rt_hash_lock_addr(k));
1053                         if (goal <= 0)
1054                                 break;
1055                 }
1056                 rover = k;
1057
1058                 if (goal <= 0)
1059                         goto work_done;
1060
1061                 /* The goal is not achieved. We stop the process if:
1062
1063                    - expire has been reduced to zero; otherwise, expire is halved.
1064                    - the table is not full.
1065                    - we are called from interrupt context.
1066                    - the jiffies check is just a fallback/debug loop breaker;
1067                      we will not spin here for a long time in any case.
1068                  */
1069
1070                 RT_CACHE_STAT_INC(gc_goal_miss);
1071
1072                 if (expire == 0)
1073                         break;
1074
1075                 expire >>= 1;
1076
1077                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1078                         goto out;
1079         } while (!in_softirq() && time_before_eq(jiffies, now));
1080
1081         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1084                 goto out;
1085         if (net_ratelimit())
1086                 printk(KERN_WARNING "dst cache overflow\n");
1087         RT_CACHE_STAT_INC(gc_dst_overflow);
1088         return 1;
1089
1090 work_done:
1091         expire += ip_rt_gc_min_interval;
1092         if (expire > ip_rt_gc_timeout ||
1093             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095                 expire = ip_rt_gc_timeout;
1096 out:    return 0;
1097 }
1098
1099 /*
1100  * Returns the number of entries in a hash chain that have different hash_inputs.
1101  */
1102 static int slow_chain_length(const struct rtable *head)
1103 {
1104         int length = 0;
1105         const struct rtable *rth = head;
1106
1107         while (rth) {
1108                 length += has_noalias(head, rth);
1109                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110         }
1111         return length >> FRACT_BITS;
1112 }
1113
1114 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115 {
1116         static const __be32 inaddr_any = 0;
1117         struct net_device *dev = dst->dev;
1118         const __be32 *pkey = daddr;
1119         struct neighbour *n;
1120
1121         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                 pkey = &inaddr_any;
1123
1124         n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1125         if (n)
1126                 return n;
1127         return neigh_create(&arp_tbl, pkey, dev);
1128 }
1129
1130 static int rt_bind_neighbour(struct rtable *rt)
1131 {
1132         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1133         if (IS_ERR(n))
1134                 return PTR_ERR(n);
1135         dst_set_neighbour(&rt->dst, n);
1136
1137         return 0;
1138 }
1139
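/*
 * Insert a newly created route into the cache bucket 'hash'.  If an
 * equivalent entry already exists it is moved to the chain head and returned
 * instead, and the new route is dropped.  When caching is disabled the route
 * is marked DST_NOCACHE and returned uncached.  Overly long chains cause the
 * least valuable unused entry to be evicted, or an emergency hash rebuild.
 */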
1140 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1141                                      struct sk_buff *skb, int ifindex)
1142 {
1143         struct rtable   *rth, *cand;
1144         struct rtable __rcu **rthp, **candp;
1145         unsigned long   now;
1146         u32             min_score;
1147         int             chain_length;
1148         int attempts = !in_softirq();
1149
1150 restart:
1151         chain_length = 0;
1152         min_score = ~(u32)0;
1153         cand = NULL;
1154         candp = NULL;
1155         now = jiffies;
1156
1157         if (!rt_caching(dev_net(rt->dst.dev))) {
1158                 /*
1159                  * If we're not caching, just tell the caller we
1160                  * were successful and don't touch the route.  The
1161                  * caller holds the sole reference to the cache entry, and
1162                  * it will be released when the caller is done with it.
1163                  * If we drop it here, the callers have no way to resolve routes
1164                  * when we're not caching.  Instead, just point *rp at rt, so
1165                  * the caller gets a single use out of the route.
1166                  * Note that we do rt_free on this new route entry, so that
1167                  * once its refcount hits zero, we are still able to reap it
1168                  * (Thanks Alexey)
1169                  * Note: To avoid expensive rcu stuff for this uncached dst,
1170                  * we set DST_NOCACHE so that dst_release() can free dst without
1171                  * waiting for a grace period.
1172                  */
1173
1174                 rt->dst.flags |= DST_NOCACHE;
1175                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1176                         int err = rt_bind_neighbour(rt);
1177                         if (err) {
1178                                 if (net_ratelimit())
1179                                         printk(KERN_WARNING
1180                                             "Neighbour table failure & not caching routes.\n");
1181                                 ip_rt_put(rt);
1182                                 return ERR_PTR(err);
1183                         }
1184                 }
1185
1186                 goto skip_hashing;
1187         }
1188
1189         rthp = &rt_hash_table[hash].chain;
1190
1191         spin_lock_bh(rt_hash_lock_addr(hash));
1192         while ((rth = rcu_dereference_protected(*rthp,
1193                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                 if (rt_is_expired(rth)) {
1195                         *rthp = rth->dst.rt_next;
1196                         rt_free(rth);
1197                         continue;
1198                 }
1199                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                         /* Put it first */
1201                         *rthp = rth->dst.rt_next;
1202                         /*
1203                          * Since lookup is lockfree, the deletion
1204                          * must be visible to another weakly ordered CPU before
1205                          * the insertion at the start of the hash chain.
1206                          */
1207                         rcu_assign_pointer(rth->dst.rt_next,
1208                                            rt_hash_table[hash].chain);
1209                         /*
1210                          * Since lookup is lockfree, the update writes
1211                          * must be ordered for consistency on SMP.
1212                          */
1213                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                         dst_use(&rth->dst, now);
1216                         spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                         rt_drop(rt);
1219                         if (skb)
1220                                 skb_dst_set(skb, &rth->dst);
1221                         return rth;
1222                 }
1223
1224                 if (!atomic_read(&rth->dst.__refcnt)) {
1225                         u32 score = rt_score(rth);
1226
1227                         if (score <= min_score) {
1228                                 cand = rth;
1229                                 candp = rthp;
1230                                 min_score = score;
1231                         }
1232                 }
1233
1234                 chain_length++;
1235
1236                 rthp = &rth->dst.rt_next;
1237         }
1238
1239         if (cand) {
1240                 /* ip_rt_gc_elasticity used to be the average chain length;
1241                  * when exceeded, gc becomes really aggressive.
1242                  *
1243                  * The second limit is less certain. At the moment it allows
1244                  * only 2 entries per bucket. We will see.
1245                  */
1246                 if (chain_length > ip_rt_gc_elasticity) {
1247                         *candp = cand->dst.rt_next;
1248                         rt_free(cand);
1249                 }
1250         } else {
1251                 if (chain_length > rt_chain_length_max &&
1252                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                         struct net *net = dev_net(rt->dst.dev);
1254                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                         if (!rt_caching(net)) {
1256                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1257                                         rt->dst.dev->name, num);
1258                         }
1259                         rt_emergency_hash_rebuild(net);
1260                         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                         ifindex, rt_genid(net));
1264                         goto restart;
1265                 }
1266         }
1267
1268         /* Try to bind the route to arp only if it is an output
1269            route or on the unicast forwarding path.
1270          */
1271         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                 int err = rt_bind_neighbour(rt);
1273                 if (err) {
1274                         spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                         if (err != -ENOBUFS) {
1277                                 rt_drop(rt);
1278                                 return ERR_PTR(err);
1279                         }
1280
1281                         /* Neighbour tables are full and nothing
1282                            can be released. Try to shrink the route cache;
1283                            it most likely holds some neighbour records.
1284                          */
1285                         if (attempts-- > 0) {
1286                                 int saved_elasticity = ip_rt_gc_elasticity;
1287                                 int saved_int = ip_rt_gc_min_interval;
1288                                 ip_rt_gc_elasticity     = 1;
1289                                 ip_rt_gc_min_interval   = 0;
1290                                 rt_garbage_collect(&ipv4_dst_ops);
1291                                 ip_rt_gc_min_interval   = saved_int;
1292                                 ip_rt_gc_elasticity     = saved_elasticity;
1293                                 goto restart;
1294                         }
1295
1296                         if (net_ratelimit())
1297                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1298                         rt_drop(rt);
1299                         return ERR_PTR(-ENOBUFS);
1300                 }
1301         }
1302
1303         rt->dst.rt_next = rt_hash_table[hash].chain;
1304
1305         /*
1306          * Since lookup is lockfree, we must make sure
1307          * previous writes to rt are committed to memory
1308          * before making rt visible to other CPUs.
1309          */
1310         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1311
1312         spin_unlock_bh(rt_hash_lock_addr(hash));
1313
1314 skip_hashing:
1315         if (skb)
1316                 skb_dst_set(skb, &rt->dst);
1317         return rt;
1318 }
1319
1320 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1321
1322 static u32 rt_peer_genid(void)
1323 {
1324         return atomic_read(&__rt_peer_genid);
1325 }
1326
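/*
 * Attach the inet_peer entry for 'daddr' to the route.  If another CPU
 * attached a peer first, the freshly obtained reference is dropped.
 */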
1327 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1328 {
1329         struct inet_peer *peer;
1330
1331         peer = inet_getpeer_v4(daddr, create);
1332
1333         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1334                 inet_putpeer(peer);
1335         else
1336                 rt->rt_peer_genid = rt_peer_genid();
1337 }
1338
1339 /*
1340  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1341  * we can still generate some output.
1342  * Random ID selection looks a bit dangerous because we have no chance of
1343  * selecting an ID that is unique over a reasonable period of time.
1344  * But a broken packet identifier may be better than no packet at all.
1345  */
1346 static void ip_select_fb_ident(struct iphdr *iph)
1347 {
1348         static DEFINE_SPINLOCK(ip_fb_id_lock);
1349         static u32 ip_fallback_id;
1350         u32 salt;
1351
1352         spin_lock_bh(&ip_fb_id_lock);
1353         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1354         iph->id = htons(salt & 0xFFFF);
1355         ip_fallback_id = salt;
1356         spin_unlock_bh(&ip_fb_id_lock);
1357 }
1358
1359 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1360 {
1361         struct rtable *rt = (struct rtable *) dst;
1362
1363         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1364                 if (rt->peer == NULL)
1365                         rt_bind_peer(rt, rt->rt_dst, 1);
1366
1367                 /* If a peer is attached to the destination, it is never detached,
1368                    so we need not grab a lock to dereference it.
1369                  */
1370                 if (rt->peer) {
1371                         iph->id = htons(inet_getid(rt->peer, more));
1372                         return;
1373                 }
1374         } else if (!rt)
1375                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1376                        __builtin_return_address(0));
1377
1378         ip_select_fb_ident(iph);
1379 }
1380 EXPORT_SYMBOL(__ip_select_ident);
1381
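/*
 * Remove 'rt' from the cache bucket 'hash', dropping a reference to it;
 * expired entries found while walking the chain are freed as well.
 */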
1382 static void rt_del(unsigned hash, struct rtable *rt)
1383 {
1384         struct rtable __rcu **rthp;
1385         struct rtable *aux;
1386
1387         rthp = &rt_hash_table[hash].chain;
1388         spin_lock_bh(rt_hash_lock_addr(hash));
1389         ip_rt_put(rt);
1390         while ((aux = rcu_dereference_protected(*rthp,
1391                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1392                 if (aux == rt || rt_is_expired(aux)) {
1393                         *rthp = aux->dst.rt_next;
1394                         rt_free(aux);
1395                         continue;
1396                 }
1397                 rthp = &aux->dst.rt_next;
1398         }
1399         spin_unlock_bh(rt_hash_lock_addr(hash));
1400 }
1401
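/*
 * Switch the route's gateway to the address learned from an ICMP redirect
 * (stored in the peer), rebinding the neighbour entry and reverting to the
 * old gateway if no neighbour can be created.
 */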
1402 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1403 {
1404         struct rtable *rt = (struct rtable *) dst;
1405         __be32 orig_gw = rt->rt_gateway;
1406         struct neighbour *n, *old_n;
1407
1408         dst_confirm(&rt->dst);
1409
1410         rt->rt_gateway = peer->redirect_learned.a4;
1411
1412         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1413         if (IS_ERR(n)) {
1414                 rt->rt_gateway = orig_gw;
1415                 return;
1416         }
1417         old_n = xchg(&rt->dst._neighbour, n);
1418         if (old_n)
1419                 neigh_release(old_n);
1420         if (!(n->nud_state & NUD_VALID)) {
1421                 neigh_event_send(n, NULL);
1422         } else {
1423                 rt->rt_flags |= RTCF_REDIRECTED;
1424                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1425         }
1426 }
1427
1428 /* called in rcu_read_lock() section */
1429 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1430                     __be32 saddr, struct net_device *dev)
1431 {
1432         int s, i;
1433         struct in_device *in_dev = __in_dev_get_rcu(dev);
1434         __be32 skeys[2] = { saddr, 0 };
1435         int    ikeys[2] = { dev->ifindex, 0 };
1436         struct inet_peer *peer;
1437         struct net *net;
1438
1439         if (!in_dev)
1440                 return;
1441
1442         net = dev_net(dev);
1443         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1444             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1445             ipv4_is_zeronet(new_gw))
1446                 goto reject_redirect;
1447
1448         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1449                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1450                         goto reject_redirect;
1451                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1452                         goto reject_redirect;
1453         } else {
1454                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1455                         goto reject_redirect;
1456         }
1457
1458         for (s = 0; s < 2; s++) {
1459                 for (i = 0; i < 2; i++) {
1460                         unsigned int hash;
1461                         struct rtable __rcu **rthp;
1462                         struct rtable *rt;
1463
1464                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1465
1466                         rthp = &rt_hash_table[hash].chain;
1467
1468                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1469                                 rthp = &rt->dst.rt_next;
1470
1471                                 if (rt->rt_key_dst != daddr ||
1472                                     rt->rt_key_src != skeys[s] ||
1473                                     rt->rt_oif != ikeys[i] ||
1474                                     rt_is_input_route(rt) ||
1475                                     rt_is_expired(rt) ||
1476                                     !net_eq(dev_net(rt->dst.dev), net) ||
1477                                     rt->dst.error ||
1478                                     rt->dst.dev != dev ||
1479                                     rt->rt_gateway != old_gw)
1480                                         continue;
1481
1482                                 if (!rt->peer)
1483                                         rt_bind_peer(rt, rt->rt_dst, 1);
1484
1485                                 peer = rt->peer;
1486                                 if (peer) {
1487                                         if (peer->redirect_learned.a4 != new_gw) {
1488                                                 peer->redirect_learned.a4 = new_gw;
1489                                                 atomic_inc(&__rt_peer_genid);
1490                                         }
1491                                         check_peer_redir(&rt->dst, peer);
1492                                 }
1493                         }
1494                 }
1495         }
1496         return;
1497
1498 reject_redirect:
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1501                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1502                         "  Advised path = %pI4 -> %pI4\n",
1503                        &old_gw, dev->name, &new_gw,
1504                        &saddr, &daddr);
1505 #endif
1506         ;
1507 }
1508
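/* Helpers that atomically claim the peer's pmtu_expires timestamp:
 * peer_pmtu_expired() succeeds only once the deadline has passed,
 * peer_pmtu_cleaned() unconditionally; the cmpxchg() ensures that only
 * one caller gets to restore the original MTU.
 */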
1509 static bool peer_pmtu_expired(struct inet_peer *peer)
1510 {
1511         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1512
1513         return orig &&
1514                time_after_eq(jiffies, orig) &&
1515                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1516 }
1517
1518 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1519 {
1520         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1521
1522         return orig &&
1523                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524 }
1525
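/* Negative advice for a cached route: release obsolete entries, evict
 * entries installed because of an ICMP redirect, and restore the original
 * MTU once a learned PMTU has expired.
 */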
1526 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1527 {
1528         struct rtable *rt = (struct rtable *)dst;
1529         struct dst_entry *ret = dst;
1530
1531         if (rt) {
1532                 if (dst->obsolete > 0) {
1533                         ip_rt_put(rt);
1534                         ret = NULL;
1535                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1536                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1537                                                 rt->rt_oif,
1538                                                 rt_genid(dev_net(dst->dev)));
1539                         rt_del(hash, rt);
1540                         ret = NULL;
1541                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1542                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1543                 }
1544         }
1545         return ret;
1546 }
1547
1548 /*
1549  * Algorithm:
1550  *      1. The first ip_rt_redirect_number redirects are sent
1551  *         with exponential backoff; after that we stop sending them,
1552  *         assuming that the host ignores our redirects.
1553  *      2. If we did not see packets requiring redirects
1554  *         during ip_rt_redirect_silence, we assume that the host
1555  *         has forgotten the redirected route and start sending redirects again.
1556  *
1557  * This algorithm is much cheaper and more intelligent than dumb load limiting
1558  * in icmp.c.
1559  *
1560  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1561  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1562  */
1563
1564 void ip_rt_send_redirect(struct sk_buff *skb)
1565 {
1566         struct rtable *rt = skb_rtable(skb);
1567         struct in_device *in_dev;
1568         struct inet_peer *peer;
1569         int log_martians;
1570
1571         rcu_read_lock();
1572         in_dev = __in_dev_get_rcu(rt->dst.dev);
1573         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1574                 rcu_read_unlock();
1575                 return;
1576         }
1577         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1578         rcu_read_unlock();
1579
1580         if (!rt->peer)
1581                 rt_bind_peer(rt, rt->rt_dst, 1);
1582         peer = rt->peer;
1583         if (!peer) {
1584                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1585                 return;
1586         }
1587
1588         /* No redirected packets during ip_rt_redirect_silence;
1589          * reset the algorithm.
1590          */
1591         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1592                 peer->rate_tokens = 0;
1593
1594         /* Too many ignored redirects; do not send anything and
1595          * set dst.rate_last to the last seen redirected packet.
1596          */
1597         if (peer->rate_tokens >= ip_rt_redirect_number) {
1598                 peer->rate_last = jiffies;
1599                 return;
1600         }
1601
1602         /* Check for load limit; set rate_last to the latest sent
1603          * redirect.
1604          */
1605         if (peer->rate_tokens == 0 ||
1606             time_after(jiffies,
1607                        (peer->rate_last +
1608                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1609                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1610                 peer->rate_last = jiffies;
1611                 ++peer->rate_tokens;
1612 #ifdef CONFIG_IP_ROUTE_VERBOSE
1613                 if (log_martians &&
1614                     peer->rate_tokens == ip_rt_redirect_number &&
1615                     net_ratelimit())
1616                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1617                                &ip_hdr(skb)->saddr, rt->rt_iif,
1618                                 &rt->rt_dst, &rt->rt_gateway);
1619 #endif
1620         }
1621 }
1622
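/* dst input handler for unreachable routes: map dst.error to an ICMP
 * destination-unreachable code and send it, rate limited by a simple
 * token bucket kept on the destination's inet_peer, then drop the skb.
 */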
1623 static int ip_error(struct sk_buff *skb)
1624 {
1625         struct rtable *rt = skb_rtable(skb);
1626         struct inet_peer *peer;
1627         unsigned long now;
1628         bool send;
1629         int code;
1630
1631         switch (rt->dst.error) {
1632         case EINVAL:
1633         default:
1634                 goto out;
1635         case EHOSTUNREACH:
1636                 code = ICMP_HOST_UNREACH;
1637                 break;
1638         case ENETUNREACH:
1639                 code = ICMP_NET_UNREACH;
1640                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1641                                 IPSTATS_MIB_INNOROUTES);
1642                 break;
1643         case EACCES:
1644                 code = ICMP_PKT_FILTERED;
1645                 break;
1646         }
1647
1648         if (!rt->peer)
1649                 rt_bind_peer(rt, rt->rt_dst, 1);
1650         peer = rt->peer;
1651
1652         send = true;
1653         if (peer) {
1654                 now = jiffies;
1655                 peer->rate_tokens += now - peer->rate_last;
1656                 if (peer->rate_tokens > ip_rt_error_burst)
1657                         peer->rate_tokens = ip_rt_error_burst;
1658                 peer->rate_last = now;
1659                 if (peer->rate_tokens >= ip_rt_error_cost)
1660                         peer->rate_tokens -= ip_rt_error_cost;
1661                 else
1662                         send = false;
1663         }
1664         if (send)
1665                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1666
1667 out:    kfree_skb(skb);
1668         return 0;
1669 }
1670
1671 /*
1672  *      The last two values are not from the RFC but
1673  *      are needed for AMPRnet AX.25 paths.
1674  */
1675
1676 static const unsigned short mtu_plateau[] =
1677 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1678
1679 static inline unsigned short guess_mtu(unsigned short old_mtu)
1680 {
1681         int i;
1682
1683         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1684                 if (old_mtu > mtu_plateau[i])
1685                         return mtu_plateau[i];
1686         return 68;
1687 }
1688
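/* Handle an ICMP "fragmentation needed" message: sanity-check the
 * advertised MTU (guessing a plateau value for broken senders), clamp it
 * to ip_rt_min_pmtu, and record it with an expiry on the destination's
 * inet_peer.  Returns the MTU actually learned, or new_mtu unchanged.
 */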
1689 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690                                  unsigned short new_mtu,
1691                                  struct net_device *dev)
1692 {
1693         unsigned short old_mtu = ntohs(iph->tot_len);
1694         unsigned short est_mtu = 0;
1695         struct inet_peer *peer;
1696
1697         peer = inet_getpeer_v4(iph->daddr, 1);
1698         if (peer) {
1699                 unsigned short mtu = new_mtu;
1700
1701                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1702                         /* BSD 4.2 derived systems incorrectly adjust
1703                          * tot_len by the IP header length, and report
1704                          * a zero MTU in the ICMP message.
1705                          */
1706                         if (mtu == 0 &&
1707                             old_mtu >= 68 + (iph->ihl << 2))
1708                                 old_mtu -= iph->ihl << 2;
1709                         mtu = guess_mtu(old_mtu);
1710                 }
1711
1712                 if (mtu < ip_rt_min_pmtu)
1713                         mtu = ip_rt_min_pmtu;
1714                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715                         unsigned long pmtu_expires;
1716
1717                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1718                         if (!pmtu_expires)
1719                                 pmtu_expires = 1UL;
1720
1721                         est_mtu = mtu;
1722                         peer->pmtu_learned = mtu;
1723                         peer->pmtu_expires = pmtu_expires;
1724                         atomic_inc(&__rt_peer_genid);
1725                 }
1726
1727                 inet_putpeer(peer);
1728         }
1729         return est_mtu ? : new_mtu;
1730 }
1731
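/* Apply the peer's learned PMTU to @dst while it is still valid: shrink
 * the route MTU (remembering the original), and restore the original MTU
 * once the learned value expires.
 */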
1732 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733 {
1734         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1735
1736         if (!expires)
1737                 return;
1738         if (time_before(jiffies, expires)) {
1739                 u32 orig_dst_mtu = dst_mtu(dst);
1740                 if (peer->pmtu_learned < orig_dst_mtu) {
1741                         if (!peer->pmtu_orig)
1742                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744                 }
1745         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747 }
1748
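/* Update the path MTU for a route: record the newly learned MTU on the
 * route's inet_peer with an expiry, bump the peer generation counter, and
 * apply it to the dst via check_peer_pmtu().
 */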
1749 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750 {
1751         struct rtable *rt = (struct rtable *) dst;
1752         struct inet_peer *peer;
1753
1754         dst_confirm(dst);
1755
1756         if (!rt->peer)
1757                 rt_bind_peer(rt, rt->rt_dst, 1);
1758         peer = rt->peer;
1759         if (peer) {
1760                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762                 if (mtu < ip_rt_min_pmtu)
1763                         mtu = ip_rt_min_pmtu;
1764                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765
1766                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1767                         if (!pmtu_expires)
1768                                 pmtu_expires = 1UL;
1769
1770                         peer->pmtu_learned = mtu;
1771                         peer->pmtu_expires = pmtu_expires;
1772
1773                         atomic_inc(&__rt_peer_genid);
1774                         rt->rt_peer_genid = rt_peer_genid();
1775                 }
1776                 check_peer_pmtu(dst, peer);
1777         }
1778 }
1779
1780
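/* Bring a cached route back in sync with its inet_peer when the global
 * peer generation counter has moved on: re-apply any learned PMTU and
 * any learned redirect gateway.
 */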
1781 static void ipv4_validate_peer(struct rtable *rt)
1782 {
1783         if (rt->rt_peer_genid != rt_peer_genid()) {
1784                 struct inet_peer *peer;
1785
1786                 if (!rt->peer)
1787                         rt_bind_peer(rt, rt->rt_dst, 0);
1788
1789                 peer = rt->peer;
1790                 if (peer) {
1791                         check_peer_pmtu(&rt->dst, peer);
1792
1793                         if (peer->redirect_learned.a4 &&
1794                             peer->redirect_learned.a4 != rt->rt_gateway)
1795                                 check_peer_redir(&rt->dst, peer);
1796                 }
1797
1798                 rt->rt_peer_genid = rt_peer_genid();
1799         }
1800 }
1801
1802 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803 {
1804         struct rtable *rt = (struct rtable *) dst;
1805
1806         if (rt_is_expired(rt))
1807                 return NULL;
1808         ipv4_validate_peer(rt);
1809         return dst;
1810 }
1811
1812 static void ipv4_dst_destroy(struct dst_entry *dst)
1813 {
1814         struct rtable *rt = (struct rtable *) dst;
1815         struct inet_peer *peer = rt->peer;
1816
1817         if (rt->fi) {
1818                 fib_info_put(rt->fi);
1819                 rt->fi = NULL;
1820         }
1821         if (peer) {
1822                 rt->peer = NULL;
1823                 inet_putpeer(peer);
1824         }
1825 }
1826
1827
1828 static void ipv4_link_failure(struct sk_buff *skb)
1829 {
1830         struct rtable *rt;
1831
1832         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833
1834         rt = skb_rtable(skb);
1835         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1837 }
1838
1839 static int ip_rt_bug(struct sk_buff *skb)
1840 {
1841         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1842                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1843                 skb->dev ? skb->dev->name : "?");
1844         kfree_skb(skb);
1845         WARN_ON(1);
1846         return 0;
1847 }
1848
1849 /*
1850    We do not cache the source address of the outgoing interface,
1851    because it is used only by the IP RR, TS and SRR options,
1852    so it stays out of the fast path.
1853
1854    BTW remember: "addr" is allowed to be unaligned
1855    in IP options!
1856  */
1857
1858 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1859 {
1860         __be32 src;
1861
1862         if (rt_is_output_route(rt))
1863                 src = ip_hdr(skb)->saddr;
1864         else {
1865                 struct fib_result res;
1866                 struct flowi4 fl4;
1867                 struct iphdr *iph;
1868
1869                 iph = ip_hdr(skb);
1870
1871                 memset(&fl4, 0, sizeof(fl4));
1872                 fl4.daddr = iph->daddr;
1873                 fl4.saddr = iph->saddr;
1874                 fl4.flowi4_tos = RT_TOS(iph->tos);
1875                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1876                 fl4.flowi4_iif = skb->dev->ifindex;
1877                 fl4.flowi4_mark = skb->mark;
1878
1879                 rcu_read_lock();
1880                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882                 else
1883                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1884                                         RT_SCOPE_UNIVERSE);
1885                 rcu_read_unlock();
1886         }
1887         memcpy(addr, &src, 4);
1888 }
1889
1890 #ifdef CONFIG_IP_ROUTE_CLASSID
1891 static void set_class_tag(struct rtable *rt, u32 tag)
1892 {
1893         if (!(rt->dst.tclassid & 0xFFFF))
1894                 rt->dst.tclassid |= tag & 0xFFFF;
1895         if (!(rt->dst.tclassid & 0xFFFF0000))
1896                 rt->dst.tclassid |= tag & 0xFFFF0000;
1897 }
1898 #endif
1899
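/* Compute the advertised MSS when no explicit ADVMSS metric is set:
 * the device MTU minus 40 bytes of IP+TCP header, bounded below by
 * ip_rt_min_advmss and above by 65535 - 40.
 */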
1900 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901 {
1902         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903
1904         if (advmss == 0) {
1905                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906                                ip_rt_min_advmss);
1907                 if (advmss > 65535 - 40)
1908                         advmss = 65535 - 40;
1909         }
1910         return advmss;
1911 }
1912
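/* Report the effective MTU for a route: prefer an explicit MTU metric on
 * output routes; otherwise fall back to the device MTU, capped at 576 for
 * locked-MTU routes that go via a gateway and at IP_MAX_MTU overall.
 */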
1913 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914 {
1915         const struct rtable *rt = (const struct rtable *) dst;
1916         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917
1918         if (mtu && rt_is_output_route(rt))
1919                 return mtu;
1920
1921         mtu = dst->dev->mtu;
1922
1923         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924
1925                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926                         mtu = 576;
1927         }
1928
1929         if (mtu > IP_MAX_MTU)
1930                 mtu = IP_MAX_MTU;
1931
1932         return mtu;
1933 }
1934
1935 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936                             struct fib_info *fi)
1937 {
1938         struct inet_peer *peer;
1939         int create = 0;
1940
1941         /* If a peer entry exists for this destination, we must hook
1942          * it up in order to get at cached metrics.
1943          */
1944         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945                 create = 1;
1946
1947         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948         if (peer) {
1949                 rt->rt_peer_genid = rt_peer_genid();
1950                 if (inet_metrics_new(peer))
1951                         memcpy(peer->metrics, fi->fib_metrics,
1952                                sizeof(u32) * RTAX_MAX);
1953                 dst_init_metrics(&rt->dst, peer->metrics, false);
1954
1955                 check_peer_pmtu(&rt->dst, peer);
1956
1957                 if (peer->redirect_learned.a4 &&
1958                     peer->redirect_learned.a4 != rt->rt_gateway) {
1959                         rt->rt_gateway = peer->redirect_learned.a4;
1960                         rt->rt_flags |= RTCF_REDIRECTED;
1961                 }
1962         } else {
1963                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964                         rt->fi = fi;
1965                         atomic_inc(&fi->fib_clntref);
1966                 }
1967                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968         }
1969 }
1970
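/* Fill in the nexthop-derived fields of a new cache entry: gateway (for
 * link-scope nexthops), metrics, classid tags, and final clamping of the
 * MTU and ADVMSS metrics.
 */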
1971 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972                            const struct fib_result *res,
1973                            struct fib_info *fi, u16 type, u32 itag)
1974 {
1975         struct dst_entry *dst = &rt->dst;
1976
1977         if (fi) {
1978                 if (FIB_RES_GW(*res) &&
1979                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980                         rt->rt_gateway = FIB_RES_GW(*res);
1981                 rt_init_metrics(rt, fl4, fi);
1982 #ifdef CONFIG_IP_ROUTE_CLASSID
1983                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984 #endif
1985         }
1986
1987         if (dst_mtu(dst) > IP_MAX_MTU)
1988                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992 #ifdef CONFIG_IP_ROUTE_CLASSID
1993 #ifdef CONFIG_IP_MULTIPLE_TABLES
1994         set_class_tag(rt, fib_rules_tclass(res));
1995 #endif
1996         set_class_tag(rt, itag);
1997 #endif
1998 }
1999
2000 static struct rtable *rt_dst_alloc(struct net_device *dev,
2001                                    bool nopolicy, bool noxfrm)
2002 {
2003         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004                          DST_HOST |
2005                          (nopolicy ? DST_NOPOLICY : 0) |
2006                          (noxfrm ? DST_NOXFRM : 0));
2007 }
2008
2009 /* called in rcu_read_lock() section */
2010 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011                                 u8 tos, struct net_device *dev, int our)
2012 {
2013         unsigned int hash;
2014         struct rtable *rth;
2015         __be32 spec_dst;
2016         struct in_device *in_dev = __in_dev_get_rcu(dev);
2017         u32 itag = 0;
2018         int err;
2019
2020         /* Primary sanity checks. */
2021
2022         if (in_dev == NULL)
2023                 return -EINVAL;
2024
2025         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027                 goto e_inval;
2028
2029         if (ipv4_is_zeronet(saddr)) {
2030                 if (!ipv4_is_local_multicast(daddr))
2031                         goto e_inval;
2032                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033         } else {
2034                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035                                           &itag);
2036                 if (err < 0)
2037                         goto e_err;
2038         }
2039         rth = rt_dst_alloc(init_net.loopback_dev,
2040                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2041         if (!rth)
2042                 goto e_nobufs;
2043
2044 #ifdef CONFIG_IP_ROUTE_CLASSID
2045         rth->dst.tclassid = itag;
2046 #endif
2047         rth->dst.output = ip_rt_bug;
2048
2049         rth->rt_key_dst = daddr;
2050         rth->rt_key_src = saddr;
2051         rth->rt_genid   = rt_genid(dev_net(dev));
2052         rth->rt_flags   = RTCF_MULTICAST;
2053         rth->rt_type    = RTN_MULTICAST;
2054         rth->rt_key_tos = tos;
2055         rth->rt_dst     = daddr;
2056         rth->rt_src     = saddr;
2057         rth->rt_route_iif = dev->ifindex;
2058         rth->rt_iif     = dev->ifindex;
2059         rth->rt_oif     = 0;
2060         rth->rt_mark    = skb->mark;
2061         rth->rt_gateway = daddr;
2062         rth->rt_spec_dst= spec_dst;
2063         rth->rt_peer_genid = 0;
2064         rth->peer = NULL;
2065         rth->fi = NULL;
2066         if (our) {
2067                 rth->dst.input= ip_local_deliver;
2068                 rth->rt_flags |= RTCF_LOCAL;
2069         }
2070
2071 #ifdef CONFIG_IP_MROUTE
2072         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073                 rth->dst.input = ip_mr_input;
2074 #endif
2075         RT_CACHE_STAT_INC(in_slow_mc);
2076
2077         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080
2081 e_nobufs:
2082         return -ENOBUFS;
2083 e_inval:
2084         return -EINVAL;
2085 e_err:
2086         return err;
2087 }
2088
2089
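/* Log a packet with a martian source address (rate limited), including a
 * hex dump of the link-layer header, since that is the only useful hint.
 */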
2090 static void ip_handle_martian_source(struct net_device *dev,
2091                                      struct in_device *in_dev,
2092                                      struct sk_buff *skb,
2093                                      __be32 daddr,
2094                                      __be32 saddr)
2095 {
2096         RT_CACHE_STAT_INC(in_martian_src);
2097 #ifdef CONFIG_IP_ROUTE_VERBOSE
2098         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099                 /*
2100                  *      RFC1812 recommendation: if the source is martian,
2101                  *      the only hint is the MAC header.
2102                  */
2103                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2104                         &daddr, &saddr, dev->name);
2105                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106                         int i;
2107                         const unsigned char *p = skb_mac_header(skb);
2108                         printk(KERN_WARNING "ll header: ");
2109                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2110                                 printk("%02x", *p);
2111                                 if (i < (dev->hard_header_len - 1))
2112                                         printk(":");
2113                         }
2114                         printk("\n");
2115                 }
2116         }
2117 #endif
2118 }
2119
2120 /* called in rcu_read_lock() section */
2121 static int __mkroute_input(struct sk_buff *skb,
2122                            const struct fib_result *res,
2123                            struct in_device *in_dev,
2124                            __be32 daddr, __be32 saddr, u32 tos,
2125                            struct rtable **result)
2126 {
2127         struct rtable *rth;
2128         int err;
2129         struct in_device *out_dev;
2130         unsigned int flags = 0;
2131         __be32 spec_dst;
2132         u32 itag;
2133
2134         /* get a working reference to the output device */
2135         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2136         if (out_dev == NULL) {
2137                 if (net_ratelimit())
2138                         printk(KERN_CRIT "Bug in ip_route_input" \
2139                                "_slow(). Please, report\n");
2140                 return -EINVAL;
2141         }
2142
2143
2144         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2145                                   in_dev->dev, &spec_dst, &itag);
2146         if (err < 0) {
2147                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2148                                          saddr);
2149
2150                 goto cleanup;
2151         }
2152
2153         if (err)
2154                 flags |= RTCF_DIRECTSRC;
2155
2156         if (out_dev == in_dev && err &&
2157             (IN_DEV_SHARED_MEDIA(out_dev) ||
2158              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2159                 flags |= RTCF_DOREDIRECT;
2160
2161         if (skb->protocol != htons(ETH_P_IP)) {
2162                 /* Not IP (i.e. ARP). Do not create a route if it is
2163                  * invalid for proxy arp. DNAT routes are always valid.
2164                  *
2165                  * The proxy arp feature has been extended to allow ARP
2166                  * replies back on the same interface, to support
2167                  * Private VLAN switch technologies. See arp.c.
2168                  */
2169                 if (out_dev == in_dev &&
2170                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2171                         err = -EINVAL;
2172                         goto cleanup;
2173                 }
2174         }
2175
2176         rth = rt_dst_alloc(out_dev->dev,
2177                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2178                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2179         if (!rth) {
2180                 err = -ENOBUFS;
2181                 goto cleanup;
2182         }
2183
2184         rth->rt_key_dst = daddr;
2185         rth->rt_key_src = saddr;
2186         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2187         rth->rt_flags = flags;
2188         rth->rt_type = res->type;
2189         rth->rt_key_tos = tos;
2190         rth->rt_dst     = daddr;
2191         rth->rt_src     = saddr;
2192         rth->rt_route_iif = in_dev->dev->ifindex;
2193         rth->rt_iif     = in_dev->dev->ifindex;
2194         rth->rt_oif     = 0;
2195         rth->rt_mark    = skb->mark;
2196         rth->rt_gateway = daddr;
2197         rth->rt_spec_dst= spec_dst;
2198         rth->rt_peer_genid = 0;
2199         rth->peer = NULL;
2200         rth->fi = NULL;
2201
2202         rth->dst.input = ip_forward;
2203         rth->dst.output = ip_output;
2204
2205         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2206
2207         *result = rth;
2208         err = 0;
2209  cleanup:
2210         return err;
2211 }
2212
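/* Create a forwarding cache entry for an input route (choosing among
 * multipath nexthops when configured) and insert it into the hash table.
 */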
2213 static int ip_mkroute_input(struct sk_buff *skb,
2214                             struct fib_result *res,
2215                             const struct flowi4 *fl4,
2216                             struct in_device *in_dev,
2217                             __be32 daddr, __be32 saddr, u32 tos)
2218 {
2219         struct rtable* rth = NULL;
2220         int err;
2221         unsigned hash;
2222
2223 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2224         if (res->fi && res->fi->fib_nhs > 1)
2225                 fib_select_multipath(res);
2226 #endif
2227
2228         /* create a routing cache entry */
2229         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230         if (err)
2231                 return err;
2232
2233         /* put it into the cache */
2234         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2235                        rt_genid(dev_net(rth->dst.dev)));
2236         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2237         if (IS_ERR(rth))
2238                 return PTR_ERR(rth);
2239         return 0;
2240 }
2241
2242 /*
2243  *      NOTE. We drop all packets that have a local source
2244  *      address, because every properly looped back packet
2245  *      must already have the correct destination attached by the output routine.
2246  *
2247  *      This approach solves two big problems:
2248  *      1. Non-simplex devices are handled properly.
2249  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2250  *      called with rcu_read_lock()
2251  */
2252
2253 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2254                                u8 tos, struct net_device *dev)
2255 {
2256         struct fib_result res;
2257         struct in_device *in_dev = __in_dev_get_rcu(dev);
2258         struct flowi4   fl4;
2259         unsigned        flags = 0;
2260         u32             itag = 0;
2261         struct rtable * rth;
2262         unsigned        hash;
2263         __be32          spec_dst;
2264         int             err = -EINVAL;
2265         struct net    * net = dev_net(dev);
2266
2267         /* IP on this device is disabled. */
2268
2269         if (!in_dev)
2270                 goto out;
2271
2272         /* Check for the weirdest martians, which may go undetected
2273            by fib_lookup.
2274          */
2275
2276         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2277             ipv4_is_loopback(saddr))
2278                 goto martian_source;
2279
2280         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2281                 goto brd_input;
2282
2283         /* Accept zero addresses only for limited broadcast;
2284          * I am not even sure whether to fix this or not. Waiting for complaints :-)
2285          */
2286         if (ipv4_is_zeronet(saddr))
2287                 goto martian_source;
2288
2289         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2290                 goto martian_destination;
2291
2292         /*
2293          *      Now we are ready to route the packet.
2294          */
2295         fl4.flowi4_oif = 0;
2296         fl4.flowi4_iif = dev->ifindex;
2297         fl4.flowi4_mark = skb->mark;
2298         fl4.flowi4_tos = tos;
2299         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300         fl4.daddr = daddr;
2301         fl4.saddr = saddr;
2302         err = fib_lookup(net, &fl4, &res);
2303         if (err != 0) {
2304                 if (!IN_DEV_FORWARD(in_dev))
2305                         goto e_hostunreach;
2306                 goto no_route;
2307         }
2308
2309         RT_CACHE_STAT_INC(in_slow_tot);
2310
2311         if (res.type == RTN_BROADCAST)
2312                 goto brd_input;
2313
2314         if (res.type == RTN_LOCAL) {
2315                 err = fib_validate_source(skb, saddr, daddr, tos,
2316                                           net->loopback_dev->ifindex,
2317                                           dev, &spec_dst, &itag);
2318                 if (err < 0)
2319                         goto martian_source_keep_err;
2320                 if (err)
2321                         flags |= RTCF_DIRECTSRC;
2322                 spec_dst = daddr;
2323                 goto local_input;
2324         }
2325
2326         if (!IN_DEV_FORWARD(in_dev))
2327                 goto e_hostunreach;
2328         if (res.type != RTN_UNICAST)
2329                 goto martian_destination;
2330
2331         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2332 out:    return err;
2333
2334 brd_input:
2335         if (skb->protocol != htons(ETH_P_IP))
2336                 goto e_inval;
2337
2338         if (ipv4_is_zeronet(saddr))
2339                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340         else {
2341                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342                                           &itag);
2343                 if (err < 0)
2344                         goto martian_source_keep_err;
2345                 if (err)
2346                         flags |= RTCF_DIRECTSRC;
2347         }
2348         flags |= RTCF_BROADCAST;
2349         res.type = RTN_BROADCAST;
2350         RT_CACHE_STAT_INC(in_brd);
2351
2352 local_input:
2353         rth = rt_dst_alloc(net->loopback_dev,
2354                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2355         if (!rth)
2356                 goto e_nobufs;
2357
2358         rth->dst.input= ip_local_deliver;
2359         rth->dst.output= ip_rt_bug;
2360 #ifdef CONFIG_IP_ROUTE_CLASSID
2361         rth->dst.tclassid = itag;
2362 #endif
2363
2364         rth->rt_key_dst = daddr;
2365         rth->rt_key_src = saddr;
2366         rth->rt_genid = rt_genid(net);
2367         rth->rt_flags   = flags|RTCF_LOCAL;
2368         rth->rt_type    = res.type;
2369         rth->rt_key_tos = tos;
2370         rth->rt_dst     = daddr;
2371         rth->rt_src     = saddr;
2372 #ifdef CONFIG_IP_ROUTE_CLASSID
2373         rth->dst.tclassid = itag;
2374 #endif
2375         rth->rt_route_iif = dev->ifindex;
2376         rth->rt_iif     = dev->ifindex;
2377         rth->rt_oif     = 0;
2378         rth->rt_mark    = skb->mark;
2379         rth->rt_gateway = daddr;
2380         rth->rt_spec_dst= spec_dst;
2381         rth->rt_peer_genid = 0;
2382         rth->peer = NULL;
2383         rth->fi = NULL;
2384         if (res.type == RTN_UNREACHABLE) {
2385                 rth->dst.input= ip_error;
2386                 rth->dst.error= -err;
2387                 rth->rt_flags   &= ~RTCF_LOCAL;
2388         }
2389         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2391         err = 0;
2392         if (IS_ERR(rth))
2393                 err = PTR_ERR(rth);
2394         goto out;
2395
2396 no_route:
2397         RT_CACHE_STAT_INC(in_no_route);
2398         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399         res.type = RTN_UNREACHABLE;
2400         if (err == -ESRCH)
2401                 err = -ENETUNREACH;
2402         goto local_input;
2403
2404         /*
2405          *      Do not cache martian addresses: they should be logged (RFC1812)
2406          */
2407 martian_destination:
2408         RT_CACHE_STAT_INC(in_martian_dst);
2409 #ifdef CONFIG_IP_ROUTE_VERBOSE
2410         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2411                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2412                         &daddr, &saddr, dev->name);
2413 #endif
2414
2415 e_hostunreach:
2416         err = -EHOSTUNREACH;
2417         goto out;
2418
2419 e_inval:
2420         err = -EINVAL;
2421         goto out;
2422
2423 e_nobufs:
2424         err = -ENOBUFS;
2425         goto out;
2426
2427 martian_source:
2428         err = -EINVAL;
2429 martian_source_keep_err:
2430         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2431         goto out;
2432 }
2433
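/* Main input routing entry point: try the route cache under RCU first;
 * on a miss, handle multicast destinations specially and fall back to
 * ip_route_input_slow() for everything else.
 */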
2434 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435                            u8 tos, struct net_device *dev, bool noref)
2436 {
2437         struct rtable * rth;
2438         unsigned        hash;
2439         int iif = dev->ifindex;
2440         struct net *net;
2441         int res;
2442
2443         net = dev_net(dev);
2444
2445         rcu_read_lock();
2446
2447         if (!rt_caching(net))
2448                 goto skip_cache;
2449
2450         tos &= IPTOS_RT_MASK;
2451         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2452
2453         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2454              rth = rcu_dereference(rth->dst.rt_next)) {
2455                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2457                      (rth->rt_route_iif ^ iif) |
2458                      (rth->rt_key_tos ^ tos)) == 0 &&
2459                     rth->rt_mark == skb->mark &&
2460                     net_eq(dev_net(rth->dst.dev), net) &&
2461                     !rt_is_expired(rth)) {
2462                         ipv4_validate_peer(rth);
2463                         if (noref) {
2464                                 dst_use_noref(&rth->dst, jiffies);
2465                                 skb_dst_set_noref(skb, &rth->dst);
2466                         } else {
2467                                 dst_use(&rth->dst, jiffies);
2468                                 skb_dst_set(skb, &rth->dst);
2469                         }
2470                         RT_CACHE_STAT_INC(in_hit);
2471                         rcu_read_unlock();
2472                         return 0;
2473                 }
2474                 RT_CACHE_STAT_INC(in_hlist_search);
2475         }
2476
2477 skip_cache:
2478         /* Multicast recognition logic has moved from the route cache to here.
2479            The problem was that too many Ethernet cards have broken/missing
2480            hardware multicast filters :-( As a result, a host on a multicast
2481            network acquires a lot of useless route cache entries, e.g. from
2482            SDR messages from all over the world. Now we try to get rid of them.
2483            Really, provided the software IP multicast filter is organized
2484            reasonably (at least, hashed), it does not cause a slowdown
2485            compared with route cache reject entries.
2486            Note that multicast routers are not affected, because a
2487            route cache entry is created eventually.
2488          */
2489         if (ipv4_is_multicast(daddr)) {
2490                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2491
2492                 if (in_dev) {
2493                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494                                                   ip_hdr(skb)->protocol);
2495                         if (our
2496 #ifdef CONFIG_IP_MROUTE
2497                                 ||
2498                             (!ipv4_is_local_multicast(daddr) &&
2499                              IN_DEV_MFORWARD(in_dev))
2500 #endif
2501                            ) {
2502                                 int res = ip_route_input_mc(skb, daddr, saddr,
2503                                                             tos, dev, our);
2504                                 rcu_read_unlock();
2505                                 return res;
2506                         }
2507                 }
2508                 rcu_read_unlock();
2509                 return -EINVAL;
2510         }
2511         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512         rcu_read_unlock();
2513         return res;
2514 }
2515 EXPORT_SYMBOL(ip_route_input_common);
2516
2517 /* called with rcu_read_lock() */
2518 static struct rtable *__mkroute_output(const struct fib_result *res,
2519                                        const struct flowi4 *fl4,
2520                                        __be32 orig_daddr, __be32 orig_saddr,
2521                                        int orig_oif, __u8 orig_rtos,
2522                                        struct net_device *dev_out,
2523                                        unsigned int flags)
2524 {
2525         struct fib_info *fi = res->fi;
2526         struct in_device *in_dev;
2527         u16 type = res->type;
2528         struct rtable *rth;
2529
2530         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2531                 return ERR_PTR(-EINVAL);
2532
2533         if (ipv4_is_lbcast(fl4->daddr))
2534                 type = RTN_BROADCAST;
2535         else if (ipv4_is_multicast(fl4->daddr))
2536                 type = RTN_MULTICAST;
2537         else if (ipv4_is_zeronet(fl4->daddr))
2538                 return ERR_PTR(-EINVAL);
2539
2540         if (dev_out->flags & IFF_LOOPBACK)
2541                 flags |= RTCF_LOCAL;
2542
2543         in_dev = __in_dev_get_rcu(dev_out);
2544         if (!in_dev)
2545                 return ERR_PTR(-EINVAL);
2546
2547         if (type == RTN_BROADCAST) {
2548                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2549                 fi = NULL;
2550         } else if (type == RTN_MULTICAST) {
2551                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2552                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2553                                      fl4->flowi4_proto))
2554                         flags &= ~RTCF_LOCAL;
2555                 /* If a multicast route does not exist, use the
2556                  * default one, but do not gateway in this case.
2557                  * Yes, it is a hack.
2558                  */
2559                 if (fi && res->prefixlen < 4)
2560                         fi = NULL;
2561         }
2562
2563         rth = rt_dst_alloc(dev_out,
2564                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2565                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2566         if (!rth)
2567                 return ERR_PTR(-ENOBUFS);
2568
2569         rth->dst.output = ip_output;
2570
2571         rth->rt_key_dst = orig_daddr;
2572         rth->rt_key_src = orig_saddr;
2573         rth->rt_genid = rt_genid(dev_net(dev_out));
2574         rth->rt_flags   = flags;
2575         rth->rt_type    = type;
2576         rth->rt_key_tos = orig_rtos;
2577         rth->rt_dst     = fl4->daddr;
2578         rth->rt_src     = fl4->saddr;
2579         rth->rt_route_iif = 0;
2580         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2581         rth->rt_oif     = orig_oif;
2582         rth->rt_mark    = fl4->flowi4_mark;
2583         rth->rt_gateway = fl4->daddr;
2584         rth->rt_spec_dst= fl4->saddr;
2585         rth->rt_peer_genid = 0;
2586         rth->peer = NULL;
2587         rth->fi = NULL;
2588
2589         RT_CACHE_STAT_INC(out_slow_tot);
2590
2591         if (flags & RTCF_LOCAL) {
2592                 rth->dst.input = ip_local_deliver;
2593                 rth->rt_spec_dst = fl4->daddr;
2594         }
2595         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2596                 rth->rt_spec_dst = fl4->saddr;
2597                 if (flags & RTCF_LOCAL &&
2598                     !(dev_out->flags & IFF_LOOPBACK)) {
2599                         rth->dst.output = ip_mc_output;
2600                         RT_CACHE_STAT_INC(out_slow_mc);
2601                 }
2602 #ifdef CONFIG_IP_MROUTE
2603                 if (type == RTN_MULTICAST) {
2604                         if (IN_DEV_MFORWARD(in_dev) &&
2605                             !ipv4_is_local_multicast(fl4->daddr)) {
2606                                 rth->dst.input = ip_mr_input;
2607                                 rth->dst.output = ip_mc_output;
2608                         }
2609                 }
2610 #endif
2611         }
2612
2613         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2614
2615         return rth;
2616 }
2617
2618 /*
2619  * Major route resolver routine.
2620  * called with rcu_read_lock();
2621  */
2622
2623 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2624 {
2625         struct net_device *dev_out = NULL;
2626         __u8 tos = RT_FL_TOS(fl4);
2627         unsigned int flags = 0;
2628         struct fib_result res;
2629         struct rtable *rth;
2630         __be32 orig_daddr;
2631         __be32 orig_saddr;
2632         int orig_oif;
2633
2634         res.fi          = NULL;
2635 #ifdef CONFIG_IP_MULTIPLE_TABLES
2636         res.r           = NULL;
2637 #endif
2638
2639         orig_daddr = fl4->daddr;
2640         orig_saddr = fl4->saddr;
2641         orig_oif = fl4->flowi4_oif;
2642
2643         fl4->flowi4_iif = net->loopback_dev->ifindex;
2644         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2647
2648         rcu_read_lock();
2649         if (fl4->saddr) {
2650                 rth = ERR_PTR(-EINVAL);
2651                 if (ipv4_is_multicast(fl4->saddr) ||
2652                     ipv4_is_lbcast(fl4->saddr) ||
2653                     ipv4_is_zeronet(fl4->saddr))
2654                         goto out;
2655
2656                 /* I removed the check for oif == dev_out->oif here.
2657                    It was wrong for two reasons:
2658                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2659                       is assigned to multiple interfaces.
2660                    2. Moreover, we are allowed to send packets with the saddr
2661                       of another iface. --ANK
2662                  */
2663
2664                 if (fl4->flowi4_oif == 0 &&
2665                     (ipv4_is_multicast(fl4->daddr) ||
2666                      ipv4_is_lbcast(fl4->daddr))) {
2667                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2668                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2669                         if (dev_out == NULL)
2670                                 goto out;
2671
2672                         /* Special hack: the user can direct multicasts
2673                            and limited broadcast via the necessary interface
2674                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2675                            This hack is not just for fun, it allows
2676                            vic, vat and friends to work.
2677                            They bind the socket to loopback, set ttl to zero
2678                            and expect that it will work.
2679                            From the viewpoint of the routing cache they are broken,
2680                            because we are not allowed to build a multicast path
2681                            with a loopback source addr (the routing cache
2682                            cannot know that ttl is zero, so the packet
2683                            will not leave this host and the route is valid).
2684                            Luckily, this hack is a good workaround.
2685                          */
2686
2687                         fl4->flowi4_oif = dev_out->ifindex;
2688                         goto make_route;
2689                 }
2690
2691                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2692                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2693                         if (!__ip_dev_find(net, fl4->saddr, false))
2694                                 goto out;
2695                 }
2696         }
2697
2698
2699         if (fl4->flowi4_oif) {
2700                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2701                 rth = ERR_PTR(-ENODEV);
2702                 if (dev_out == NULL)
2703                         goto out;
2704
2705                 /* RACE: Check return value of inet_select_addr instead. */
2706                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2707                         rth = ERR_PTR(-ENETUNREACH);
2708                         goto out;
2709                 }
2710                 if (ipv4_is_local_multicast(fl4->daddr) ||
2711                     ipv4_is_lbcast(fl4->daddr)) {
2712                         if (!fl4->saddr)
2713                                 fl4->saddr = inet_select_addr(dev_out, 0,
2714                                                               RT_SCOPE_LINK);
2715                         goto make_route;
2716                 }
2717                 if (fl4->saddr) {
2718                         if (ipv4_is_multicast(fl4->daddr))
2719                                 fl4->saddr = inet_select_addr(dev_out, 0,
2720                                                               fl4->flowi4_scope);
2721                         else if (!fl4->daddr)
2722                                 fl4->saddr = inet_select_addr(dev_out, 0,
2723                                                               RT_SCOPE_HOST);
2724                 }
2725         }
2726
2727         if (!fl4->daddr) {
2728                 fl4->daddr = fl4->saddr;
2729                 if (!fl4->daddr)
2730                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2731                 dev_out = net->loopback_dev;
2732                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2733                 res.type = RTN_LOCAL;
2734                 flags |= RTCF_LOCAL;
2735                 goto make_route;
2736         }
2737
2738         if (fib_lookup(net, fl4, &res)) {
2739                 res.fi = NULL;
2740                 if (fl4->flowi4_oif) {
2741                         /* Apparently, the routing tables are wrong. Assume
2742                            that the destination is on-link.
2743
2744                            WHY? DW.
2745                            Because we are allowed to send to an iface
2746                            even if it has NO routes and NO assigned
2747                            addresses. When oif is specified, routing
2748                            tables are looked up with only one purpose:
2749                            to catch whether the destination is gatewayed, rather than
2750                            direct. Moreover, if MSG_DONTROUTE is set,
2751                            we send the packet, ignoring both routing tables
2752                            and ifaddr state. --ANK
2753
2754
2755                            We could do this even if oif is unknown
2756                            (as IPv6 likely does), but we do not.
2757                          */
2758
2759                         if (fl4->saddr == 0)
2760                                 fl4->saddr = inet_select_addr(dev_out, 0,
2761                                                               RT_SCOPE_LINK);
2762                         res.type = RTN_UNICAST;
2763                         goto make_route;
2764                 }
2765                 rth = ERR_PTR(-ENETUNREACH);
2766                 goto out;
2767         }
2768
2769         if (res.type == RTN_LOCAL) {
2770                 if (!fl4->saddr) {
2771                         if (res.fi->fib_prefsrc)
2772                                 fl4->saddr = res.fi->fib_prefsrc;
2773                         else
2774                                 fl4->saddr = fl4->daddr;
2775                 }
2776                 dev_out = net->loopback_dev;
2777                 fl4->flowi4_oif = dev_out->ifindex;
2778                 res.fi = NULL;
2779                 flags |= RTCF_LOCAL;
2780                 goto make_route;
2781         }
2782
2783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2784         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2785                 fib_select_multipath(&res);
2786         else
2787 #endif
2788         if (!res.prefixlen &&
2789             res.table->tb_num_default > 1 &&
2790             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2791                 fib_select_default(&res);
2792
2793         if (!fl4->saddr)
2794                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2795
2796         dev_out = FIB_RES_DEV(res);
2797         fl4->flowi4_oif = dev_out->ifindex;
2798
2799
2800 make_route:
2801         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2802                                tos, dev_out, flags);
2803         if (!IS_ERR(rth)) {
2804                 unsigned int hash;
2805
2806                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2807                                rt_genid(dev_net(dev_out)));
2808                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2809         }
2810
2811 out:
2812         rcu_read_unlock();
2813         return rth;
2814 }
2815
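/* Look up an output route: scan the cached hash chain for a matching key
 * (when caching is enabled) and fall back to ip_route_output_slow()
 * otherwise.
 */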
2816 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2817 {
2818         struct rtable *rth;
2819         unsigned int hash;
2820
2821         if (!rt_caching(net))
2822                 goto slow_output;
2823
2824         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2825
2826         rcu_read_lock_bh();
2827         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2828                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2829                 if (rth->rt_key_dst == flp4->daddr &&
2830                     rth->rt_key_src == flp4->saddr &&
2831                     rt_is_output_route(rth) &&
2832                     rth->rt_oif == flp4->flowi4_oif &&
2833                     rth->rt_mark == flp4->flowi4_mark &&
2834                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2835                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2836                     net_eq(dev_net(rth->dst.dev), net) &&
2837                     !rt_is_expired(rth)) {
2838                         ipv4_validate_peer(rth);
2839                         dst_use(&rth->dst, jiffies);
2840                         RT_CACHE_STAT_INC(out_hit);
2841                         rcu_read_unlock_bh();
2842                         if (!flp4->saddr)
2843                                 flp4->saddr = rth->rt_src;
2844                         if (!flp4->daddr)
2845                                 flp4->daddr = rth->rt_dst;
2846                         return rth;
2847                 }
2848                 RT_CACHE_STAT_INC(out_hlist_search);
2849         }
2850         rcu_read_unlock_bh();
2851
2852 slow_output:
2853         return ip_route_output_slow(net, flp4);
2854 }
2855 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856
2857 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2858 {
2859         return NULL;
2860 }
2861
2862 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2863 {
2864         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865
2866         return mtu ? : dst->dev->mtu;
2867 }
2868
2869 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2870 {
2871 }
2872
2873 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2874                                           unsigned long old)
2875 {
2876         return NULL;
2877 }
2878
2879 static struct dst_ops ipv4_dst_blackhole_ops = {
2880         .family                 =       AF_INET,
2881         .protocol               =       cpu_to_be16(ETH_P_IP),
2882         .destroy                =       ipv4_dst_destroy,
2883         .check                  =       ipv4_blackhole_dst_check,
2884         .mtu                    =       ipv4_blackhole_mtu,
2885         .default_advmss         =       ipv4_default_advmss,
2886         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2887         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2888         .neigh_lookup           =       ipv4_neigh_lookup,
2889 };
2890
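/* Clone @dst_orig into a "blackhole" dst whose input and output handlers
 * simply discard packets; the clone copies the original's keys, metrics
 * and peer, and the original reference is released.
 */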
2891 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2892 {
2893         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2894         struct rtable *ort = (struct rtable *) dst_orig;
2895
2896         if (rt) {
2897                 struct dst_entry *new = &rt->dst;
2898
2899                 new->__use = 1;
2900                 new->input = dst_discard;
2901                 new->output = dst_discard;
2902                 dst_copy_metrics(new, &ort->dst);
2903
2904                 new->dev = ort->dst.dev;
2905                 if (new->dev)
2906                         dev_hold(new->dev);
2907
2908                 rt->rt_key_dst = ort->rt_key_dst;
2909                 rt->rt_key_src = ort->rt_key_src;
2910                 rt->rt_key_tos = ort->rt_key_tos;
2911                 rt->rt_route_iif = ort->rt_route_iif;
2912                 rt->rt_iif = ort->rt_iif;
2913                 rt->rt_oif = ort->rt_oif;
2914                 rt->rt_mark = ort->rt_mark;
2915
2916                 rt->rt_genid = rt_genid(net);
2917                 rt->rt_flags = ort->rt_flags;
2918                 rt->rt_type = ort->rt_type;
2919                 rt->rt_dst = ort->rt_dst;
2920                 rt->rt_src = ort->rt_src;
2921                 rt->rt_gateway = ort->rt_gateway;
2922                 rt->rt_spec_dst = ort->rt_spec_dst;
2923                 rt->peer = ort->peer;
2924                 if (rt->peer)
2925                         atomic_inc(&rt->peer->refcnt);
2926                 rt->fi = ort->fi;
2927                 if (rt->fi)
2928                         atomic_inc(&rt->fi->fib_clntref);
2929
2930                 dst_free(new);
2931         }
2932
2933         dst_release(dst_orig);
2934
2935         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2936 }
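
/*
 * Editor's note (illustrative): ipv4_blackhole_route() copies the flow key,
 * metrics and device of dst_orig into a route backed by
 * ipv4_dst_blackhole_ops above: input/output are dst_discard, .check
 * returns NULL so any cached reference to it fails revalidation, and
 * .cow_metrics returns NULL so the metrics stay read-only.  Traffic routed
 * over it is silently dropped; the xfrm lookup path, for instance, can use
 * such a route when policy requires packets to be discarded.  dst_orig's
 * reference is consumed either way.  A hedged caller sketch, assuming the
 * socket currently holds a route:
 *
 *	struct dst_entry *old = sk_dst_get(sk);
 *	struct dst_entry *bh  = ipv4_blackhole_route(sock_net(sk), old);
 *
 *	if (!IS_ERR(bh))
 *		sk_dst_set(sk, bh);
 */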
2937
2938 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2939                                     struct sock *sk)
2940 {
2941         struct rtable *rt = __ip_route_output_key(net, flp4);
2942
2943         if (IS_ERR(rt))
2944                 return rt;
2945
2946         if (flp4->flowi4_proto)
2947                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948                                                    flowi4_to_flowi(flp4),
2949                                                    sk, 0);
2950
2951         return rt;
2952 }
2953 EXPORT_SYMBOL_GPL(ip_route_output_flow);
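
/*
 * Editor's note (illustrative): when flowi4_proto is set, the route found
 * by __ip_route_output_key() is additionally passed through xfrm_lookup()
 * with the caller's socket, so IPsec policy can transform or veto it.  A
 * hedged sketch of a UDP-style caller:
 *
 *	struct flowi4 fl4 = {
 *		.flowi4_oif   = sk->sk_bound_dev_if,
 *		.flowi4_mark  = sk->sk_mark,
 *		.flowi4_tos   = RT_TOS(inet_sk(sk)->tos),
 *		.flowi4_proto = IPPROTO_UDP,
 *		.daddr        = daddr,
 *		.saddr        = saddr,
 *	};
 *	struct rtable *rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */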
2954
2955 static int rt_fill_info(struct net *net,
2956                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2957                         int nowait, unsigned int flags)
2958 {
2959         struct rtable *rt = skb_rtable(skb);
2960         struct rtmsg *r;
2961         struct nlmsghdr *nlh;
2962         unsigned long expires = 0;
2963         const struct inet_peer *peer = rt->peer;
2964         u32 id = 0, ts = 0, tsage = 0, error;
2965
2966         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2967         if (nlh == NULL)
2968                 return -EMSGSIZE;
2969
2970         r = nlmsg_data(nlh);
2971         r->rtm_family    = AF_INET;
2972         r->rtm_dst_len  = 32;
2973         r->rtm_src_len  = 0;
2974         r->rtm_tos      = rt->rt_key_tos;
2975         r->rtm_table    = RT_TABLE_MAIN;
2976         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2977         r->rtm_type     = rt->rt_type;
2978         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2979         r->rtm_protocol = RTPROT_UNSPEC;
2980         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2981         if (rt->rt_flags & RTCF_NOTIFY)
2982                 r->rtm_flags |= RTM_F_NOTIFY;
2983
2984         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2985
2986         if (rt->rt_key_src) {
2987                 r->rtm_src_len = 32;
2988                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2989         }
2990         if (rt->dst.dev)
2991                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2992 #ifdef CONFIG_IP_ROUTE_CLASSID
2993         if (rt->dst.tclassid)
2994                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2995 #endif
2996         if (rt_is_input_route(rt))
2997                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2998         else if (rt->rt_src != rt->rt_key_src)
2999                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3000
3001         if (rt->rt_dst != rt->rt_gateway)
3002                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3003
3004         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005                 goto nla_put_failure;
3006
3007         if (rt->rt_mark)
3008                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3009
3010         error = rt->dst.error;
3011         if (peer) {
3012                 inet_peer_refcheck(rt->peer);
3013                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3014                 if (peer->tcp_ts_stamp) {
3015                         ts = peer->tcp_ts;
3016                         tsage = get_seconds() - peer->tcp_ts_stamp;
3017                 }
3018                 expires = ACCESS_ONCE(peer->pmtu_expires);
3019                 if (expires) {
3020                         if (time_before(jiffies, expires))
3021                                 expires -= jiffies;
3022                         else
3023                                 expires = 0;
3024                 }
3025         }
3026
3027         if (rt_is_input_route(rt)) {
3028 #ifdef CONFIG_IP_MROUTE
3029                 __be32 dst = rt->rt_dst;
3030
3031                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3032                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3033                         int err = ipmr_get_route(net, skb,
3034                                                  rt->rt_src, rt->rt_dst,
3035                                                  r, nowait);
3036                         if (err <= 0) {
3037                                 if (!nowait) {
3038                                         if (err == 0)
3039                                                 return 0;
3040                                         goto nla_put_failure;
3041                                 } else {
3042                                         if (err == -EMSGSIZE)
3043                                                 goto nla_put_failure;
3044                                         error = err;
3045                                 }
3046                         }
3047                 } else
3048 #endif
3049                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3050         }
3051
3052         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3053                                expires, error) < 0)
3054                 goto nla_put_failure;
3055
3056         return nlmsg_end(skb, nlh);
3057
3058 nla_put_failure:
3059         nlmsg_cancel(skb, nlh);
3060         return -EMSGSIZE;
3061 }
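
/*
 * Editor's note (illustrative): the message built above has the usual
 * rtnetlink shape: struct nlmsghdr, then struct rtmsg, then a sequence of
 * attributes -- RTA_TABLE, RTA_DST, optionally RTA_SRC / RTA_OIF /
 * RTA_FLOW / RTA_PREFSRC / RTA_GATEWAY / RTA_MARK / RTA_IIF, a nested
 * RTA_METRICS from rtnetlink_put_metrics(), and RTA_CACHEINFO from
 * rtnl_put_cacheinfo().  A hedged userspace sketch that walks the
 * attributes with the macros from <linux/netlink.h> and
 * <linux/rtnetlink.h>:
 *
 *	struct rtmsg *r = NLMSG_DATA(nlh);
 *	struct rtattr *rta = RTM_RTA(r);
 *	int len = RTM_PAYLOAD(nlh);
 *
 *	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len))
 *		if (rta->rta_type == RTA_DST)
 *			memcpy(&dst, RTA_DATA(rta), sizeof(dst));
 */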
3062
3063 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3064 {
3065         struct net *net = sock_net(in_skb->sk);
3066         struct rtmsg *rtm;
3067         struct nlattr *tb[RTA_MAX+1];
3068         struct rtable *rt = NULL;
3069         __be32 dst = 0;
3070         __be32 src = 0;
3071         u32 iif;
3072         int err;
3073         int mark;
3074         struct sk_buff *skb;
3075
3076         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3077         if (err < 0)
3078                 goto errout;
3079
3080         rtm = nlmsg_data(nlh);
3081
3082         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3083         if (skb == NULL) {
3084                 err = -ENOBUFS;
3085                 goto errout;
3086         }
3087
3088         /* Reserve room for dummy headers, this skb can pass
3089          * through a good chunk of the routing engine.
3090          */
3091         skb_reset_mac_header(skb);
3092         skb_reset_network_header(skb);
3093
3094         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3095         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3096         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3097
3098         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3099         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3100         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3101         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3102
3103         if (iif) {
3104                 struct net_device *dev;
3105
3106                 dev = __dev_get_by_index(net, iif);
3107                 if (dev == NULL) {
3108                         err = -ENODEV;
3109                         goto errout_free;
3110                 }
3111
3112                 skb->protocol   = htons(ETH_P_IP);
3113                 skb->dev        = dev;
3114                 skb->mark       = mark;
3115                 local_bh_disable();
3116                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3117                 local_bh_enable();
3118
3119                 rt = skb_rtable(skb);
3120                 if (err == 0 && rt->dst.error)
3121                         err = -rt->dst.error;
3122         } else {
3123                 struct flowi4 fl4 = {
3124                         .daddr = dst,
3125                         .saddr = src,
3126                         .flowi4_tos = rtm->rtm_tos,
3127                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3128                         .flowi4_mark = mark,
3129                 };
3130                 rt = ip_route_output_key(net, &fl4);
3131
3132                 err = 0;
3133                 if (IS_ERR(rt))
3134                         err = PTR_ERR(rt);
3135         }
3136
3137         if (err)
3138                 goto errout_free;
3139
3140         skb_dst_set(skb, &rt->dst);
3141         if (rtm->rtm_flags & RTM_F_NOTIFY)
3142                 rt->rt_flags |= RTCF_NOTIFY;
3143
3144         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3145                            RTM_NEWROUTE, 0, 0);
3146         if (err <= 0)
3147                 goto errout_free;
3148
3149         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3150 errout:
3151         return err;
3152
3153 errout_free:
3154         kfree_skb(skb);
3155         goto errout;
3156 }
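
/*
 * Editor's note (illustrative): this handler backs requests such as
 * "ip route get".  A hedged userspace sketch of the RTM_GETROUTE request
 * it expects (reply parsing and error handling omitted; the struct layout
 * is hypothetical but uses only standard netlink/rtnetlink definitions):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg    rtm;
 *		char            attrs[64];
 *	} req = {
 *		.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.nlh.nlmsg_type  = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family  = AF_INET,
 *	};
 *
 * RTA_DST (and optionally RTA_SRC, RTA_IIF, RTA_OIF, RTA_MARK) is then
 * appended to the message, which is sent over an AF_NETLINK/NETLINK_ROUTE
 * socket; the kernel answers with a single RTM_NEWROUTE built by
 * rt_fill_info() above.
 */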
3157
3158 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3159 {
3160         struct rtable *rt;
3161         int h, s_h;
3162         int idx, s_idx;
3163         struct net *net;
3164
3165         net = sock_net(skb->sk);
3166
3167         s_h = cb->args[0];
3168         if (s_h < 0)
3169                 s_h = 0;
3170         s_idx = idx = cb->args[1];
3171         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3172                 if (!rt_hash_table[h].chain)
3173                         continue;
3174                 rcu_read_lock_bh();
3175                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3176                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3177                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3178                                 continue;
3179                         if (rt_is_expired(rt))
3180                                 continue;
3181                         skb_dst_set_noref(skb, &rt->dst);
3182                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3183                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3184                                          1, NLM_F_MULTI) <= 0) {
3185                                 skb_dst_drop(skb);
3186                                 rcu_read_unlock_bh();
3187                                 goto done;
3188                         }
3189                         skb_dst_drop(skb);
3190                 }
3191                 rcu_read_unlock_bh();
3192         }
3193
3194 done:
3195         cb->args[0] = h;
3196         cb->args[1] = idx;
3197         return skb->len;
3198 }
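
/*
 * Editor's note (illustrative): the dump above is resumable.  The
 * rtnetlink core keeps calling it until a pass completes, and the two
 * cb->args slots form the cursor -- args[0] is the hash bucket and
 * args[1] the index within that bucket's chain -- so a later pass skips
 * everything already emitted:
 *
 *	s_h   = cb->args[0];	(bucket to restart from)
 *	s_idx = cb->args[1];	(entry index inside that bucket)
 *	...
 *	if (idx < s_idx)
 *		continue;
 */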
3199
3200 void ip_rt_multicast_event(struct in_device *in_dev)
3201 {
3202         rt_cache_flush(dev_net(in_dev->dev), 0);
3203 }
3204
3205 #ifdef CONFIG_SYSCTL
3206 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3207                                         void __user *buffer,
3208                                         size_t *lenp, loff_t *ppos)
3209 {
3210         if (write) {
3211                 int flush_delay;
3212                 ctl_table ctl;
3213                 struct net *net;
3214
3215                 memcpy(&ctl, __ctl, sizeof(ctl));
3216                 ctl.data = &flush_delay;
3217                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3218
3219                 net = (struct net *)__ctl->extra1;
3220                 rt_cache_flush(net, flush_delay);
3221                 return 0;
3222         }
3223
3224         return -EINVAL;
3225 }
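
/*
 * Editor's note (illustrative): this handler is write-only; reads return
 * -EINVAL.  The ctl_table is copied to the stack so proc_dointvec() can be
 * pointed at a local flush_delay variable, and the parsed value is handed
 * to rt_cache_flush() as its delay argument.  A hypothetical shell
 * example:
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * which requests an immediate flush of the routing cache for the writing
 * namespace (__ctl->extra1 carries the struct net pointer, set up in
 * sysctl_route_net_init() below).
 */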
3226
3227 static ctl_table ipv4_route_table[] = {
3228         {
3229                 .procname       = "gc_thresh",
3230                 .data           = &ipv4_dst_ops.gc_thresh,
3231                 .maxlen         = sizeof(int),
3232                 .mode           = 0644,
3233                 .proc_handler   = proc_dointvec,
3234         },
3235         {
3236                 .procname       = "max_size",
3237                 .data           = &ip_rt_max_size,
3238                 .maxlen         = sizeof(int),
3239                 .mode           = 0644,
3240                 .proc_handler   = proc_dointvec,
3241         },
3242         {
3243                 /*  Deprecated. Use gc_min_interval_ms */
3244
3245                 .procname       = "gc_min_interval",
3246                 .data           = &ip_rt_gc_min_interval,
3247                 .maxlen         = sizeof(int),
3248                 .mode           = 0644,
3249                 .proc_handler   = proc_dointvec_jiffies,
3250         },
3251         {
3252                 .procname       = "gc_min_interval_ms",
3253                 .data           = &ip_rt_gc_min_interval,
3254                 .maxlen         = sizeof(int),
3255                 .mode           = 0644,
3256                 .proc_handler   = proc_dointvec_ms_jiffies,
3257         },
3258         {
3259                 .procname       = "gc_timeout",
3260                 .data           = &ip_rt_gc_timeout,
3261                 .maxlen         = sizeof(int),
3262                 .mode           = 0644,
3263                 .proc_handler   = proc_dointvec_jiffies,
3264         },
3265         {
3266                 .procname       = "gc_interval",
3267                 .data           = &ip_rt_gc_interval,
3268                 .maxlen         = sizeof(int),
3269                 .mode           = 0644,
3270                 .proc_handler   = proc_dointvec_jiffies,
3271         },
3272         {
3273                 .procname       = "redirect_load",
3274                 .data           = &ip_rt_redirect_load,
3275                 .maxlen         = sizeof(int),
3276                 .mode           = 0644,
3277                 .proc_handler   = proc_dointvec,
3278         },
3279         {
3280                 .procname       = "redirect_number",
3281                 .data           = &ip_rt_redirect_number,
3282                 .maxlen         = sizeof(int),
3283                 .mode           = 0644,
3284                 .proc_handler   = proc_dointvec,
3285         },
3286         {
3287                 .procname       = "redirect_silence",
3288                 .data           = &ip_rt_redirect_silence,
3289                 .maxlen         = sizeof(int),
3290                 .mode           = 0644,
3291                 .proc_handler   = proc_dointvec,
3292         },
3293         {
3294                 .procname       = "error_cost",
3295                 .data           = &ip_rt_error_cost,
3296                 .maxlen         = sizeof(int),
3297                 .mode           = 0644,
3298                 .proc_handler   = proc_dointvec,
3299         },
3300         {
3301                 .procname       = "error_burst",
3302                 .data           = &ip_rt_error_burst,
3303                 .maxlen         = sizeof(int),
3304                 .mode           = 0644,
3305                 .proc_handler   = proc_dointvec,
3306         },
3307         {
3308                 .procname       = "gc_elasticity",
3309                 .data           = &ip_rt_gc_elasticity,
3310                 .maxlen         = sizeof(int),
3311                 .mode           = 0644,
3312                 .proc_handler   = proc_dointvec,
3313         },
3314         {
3315                 .procname       = "mtu_expires",
3316                 .data           = &ip_rt_mtu_expires,
3317                 .maxlen         = sizeof(int),
3318                 .mode           = 0644,
3319                 .proc_handler   = proc_dointvec_jiffies,
3320         },
3321         {
3322                 .procname       = "min_pmtu",
3323                 .data           = &ip_rt_min_pmtu,
3324                 .maxlen         = sizeof(int),
3325                 .mode           = 0644,
3326                 .proc_handler   = proc_dointvec,
3327         },
3328         {
3329                 .procname       = "min_adv_mss",
3330                 .data           = &ip_rt_min_advmss,
3331                 .maxlen         = sizeof(int),
3332                 .mode           = 0644,
3333                 .proc_handler   = proc_dointvec,
3334         },
3335         { }
3336 };
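
/*
 * Editor's note (illustrative): this table is attached as the "route"
 * child of ipv4_skeleton below and registered by ip_static_sysctl_init(),
 * so each entry appears as /proc/sys/net/ipv4/route/<procname>, e.g.
 *
 *	sysctl -w net.ipv4.route.gc_timeout=300
 *
 * (value chosen only for illustration).  Entries using the *_jiffies
 * handlers are presented to userspace in seconds (or milliseconds for
 * gc_min_interval_ms) and stored internally in jiffies.
 */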
3337
3338 static struct ctl_table empty[1];
3339
3340 static struct ctl_table ipv4_skeleton[] =
3341 {
3342         { .procname = "route",
3343           .mode = 0555, .child = ipv4_route_table},
3344         { .procname = "neigh",
3345           .mode = 0555, .child = empty},
3346         { }
3347 };
3348
3349 static __net_initdata struct ctl_path ipv4_path[] = {
3350         { .procname = "net", },
3351         { .procname = "ipv4", },
3352         { },
3353 };
3354
3355 static struct ctl_table ipv4_route_flush_table[] = {
3356         {
3357                 .procname       = "flush",
3358                 .maxlen         = sizeof(int),
3359                 .mode           = 0200,
3360                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3361         },
3362         { },
3363 };
3364
3365 static __net_initdata struct ctl_path ipv4_route_path[] = {
3366         { .procname = "net", },
3367         { .procname = "ipv4", },
3368         { .procname = "route", },
3369         { },
3370 };
3371
3372 static __net_init int sysctl_route_net_init(struct net *net)
3373 {
3374         struct ctl_table *tbl;
3375
3376         tbl = ipv4_route_flush_table;
3377         if (!net_eq(net, &init_net)) {
3378                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3379                 if (tbl == NULL)
3380                         goto err_dup;
3381         }
3382         tbl[0].extra1 = net;
3383
3384         net->ipv4.route_hdr =
3385                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3386         if (net->ipv4.route_hdr == NULL)
3387                 goto err_reg;
3388         return 0;
3389
3390 err_reg:
3391         if (tbl != ipv4_route_flush_table)
3392                 kfree(tbl);
3393 err_dup:
3394         return -ENOMEM;
3395 }
3396
3397 static __net_exit void sysctl_route_net_exit(struct net *net)
3398 {
3399         struct ctl_table *tbl;
3400
3401         tbl = net->ipv4.route_hdr->ctl_table_arg;
3402         unregister_net_sysctl_table(net->ipv4.route_hdr);
3403         BUG_ON(tbl == ipv4_route_flush_table);
3404         kfree(tbl);
3405 }
3406
3407 static __net_initdata struct pernet_operations sysctl_route_ops = {
3408         .init = sysctl_route_net_init,
3409         .exit = sysctl_route_net_exit,
3410 };
3411 #endif
3412
3413 static __net_init int rt_genid_init(struct net *net)
3414 {
3415         get_random_bytes(&net->ipv4.rt_genid,
3416                          sizeof(net->ipv4.rt_genid));
3417         get_random_bytes(&net->ipv4.dev_addr_genid,
3418                          sizeof(net->ipv4.dev_addr_genid));
3419         return 0;
3420 }
3421
3422 static __net_initdata struct pernet_operations rt_genid_ops = {
3423         .init = rt_genid_init,
3424 };
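
/*
 * Editor's note (illustrative): rt_genid is a per-namespace random value.
 * Each cached rtable records the generation it was created under (see
 * rt->rt_genid = rt_genid(net) in ipv4_blackhole_route() above), and
 * rt_is_expired() treats an entry as stale when the recorded value no
 * longer matches the current one, roughly:
 *
 *	rt->rt_genid != rt_genid(dev_net(rt->dst.dev))
 *
 * so the whole cache can be invalidated at once by changing the
 * namespace's generation number instead of walking the hash table.
 */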
3425
3426
3427 #ifdef CONFIG_IP_ROUTE_CLASSID
3428 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3429 #endif /* CONFIG_IP_ROUTE_CLASSID */
3430
3431 static __initdata unsigned long rhash_entries;
3432 static int __init set_rhash_entries(char *str)
3433 {
3434         if (!str)
3435                 return 0;
3436         rhash_entries = simple_strtoul(str, &str, 0);
3437         return 1;
3438 }
3439 __setup("rhash_entries=", set_rhash_entries);
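
/*
 * Editor's note (illustrative): "rhash_entries=N" on the kernel command
 * line overrides the automatic sizing of the route cache hash table in
 * ip_rt_init() below, e.g.
 *
 *	rhash_entries=262144
 *
 * With the default of 0, alloc_large_system_hash() derives the size from
 * available memory and caps it at 512*1024 entries (the final argument of
 * the call below); an explicit value is rounded up to a power of two and
 * removes that cap.
 */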
3440
3441 int __init ip_rt_init(void)
3442 {
3443         int rc = 0;
3444
3445 #ifdef CONFIG_IP_ROUTE_CLASSID
3446         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3447         if (!ip_rt_acct)
3448                 panic("IP: failed to allocate ip_rt_acct\n");
3449 #endif
3450
3451         ipv4_dst_ops.kmem_cachep =
3452                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3453                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3454
3455         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3456
3457         if (dst_entries_init(&ipv4_dst_ops) < 0)
3458                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3459
3460         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3461                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3462
3463         rt_hash_table = (struct rt_hash_bucket *)
3464                 alloc_large_system_hash("IP route cache",
3465                                         sizeof(struct rt_hash_bucket),
3466                                         rhash_entries,
3467                                         (totalram_pages >= 128 * 1024) ?
3468                                         15 : 17,
3469                                         0,
3470                                         &rt_hash_log,
3471                                         &rt_hash_mask,
3472                                         rhash_entries ? 0 : 512 * 1024);
3473         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3474         rt_hash_lock_init();
3475
3476         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3477         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3478
3479         devinet_init();
3480         ip_fib_init();
3481
3482         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3483         expires_ljiffies = jiffies;
3484         schedule_delayed_work(&expires_work,
3485                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3486
3487         if (ip_rt_proc_init())
3488                 printk(KERN_ERR "Unable to create route proc files\n");
3489 #ifdef CONFIG_XFRM
3490         xfrm_init();
3491         xfrm4_init(ip_rt_max_size);
3492 #endif
3493         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3494
3495 #ifdef CONFIG_SYSCTL
3496         register_pernet_subsys(&sysctl_route_ops);
3497 #endif
3498         register_pernet_subsys(&rt_genid_ops);
3499         return rc;
3500 }
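
/*
 * Editor's note (worked example, assuming the defaults above and enough
 * memory to reach the 512*1024-entry cap): the table then has
 * rt_hash_mask + 1 = 524288 buckets, the garbage-collection threshold
 * becomes 524288 cached routes, and ip_rt_max_size = 16 * 524288 =
 * 8388608, the point past which new cache entries are refused.
 */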
3501
3502 #ifdef CONFIG_SYSCTL
3503 /*
3504  * We really need to sanitize the damn ipv4 init order, then all
3505  * this nonsense will go away.
3506  */
3507 void __init ip_static_sysctl_init(void)
3508 {
3509         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3510 }
3511 #endif