/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>

#include <linux/sysctl.h>
#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    RCU protocol.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
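/*
 * Illustrative sketch (not compiled in): the read side implied by the
 * locking scheme above.  Readers walk a bucket chain under
 * rcu_read_lock_bh() and take a reference with an atomic increment
 * (dst_use()) before leaving the RCU section; only writers, holding
 * the bucket lock, ever unlink entries.  This mirrors the real lookup
 * loops later in this file (e.g. ip_route_input_common()) and is an
 * outline only, not an additional code path.
 */
#if 0
static struct rtable *rt_cache_lookup_sketch(unsigned int hash,
					     struct flowi *key)
{
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (compare_keys(&rth->fl, key) && !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);	/* atomic refcount++ */
			rcu_read_unlock_bh();
			return rth;
		}
	}
	rcu_read_unlock_bh();
	return NULL;
}
#endif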
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference_bh(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   (dst_metric(&r->dst, RTAX_ADVMSS) ?
			    (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->fl.fl4_tos,
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->dst.hh ? (r->dst.hh->hh_output ==
					dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_NET_CLS_ROUTE
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_NET_CLS_ROUTE
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->dst.expires &&
	    time_after_eq(jiffies, rth->dst.expires))
		goto out;

	age = jiffies - rth->dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: already scheduled
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
				       const struct flowi *fl2)
{
	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
		 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
		 (fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
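/*
 * Minimal illustration (not compiled in) of the branchless comparison
 * idiom used by compare_hash_inputs() and compare_keys() above: XOR
 * each pair of fields, OR the partial results together and test the
 * whole against zero, so the keys are equal only if every field
 * matched.  This trades an early exit for straight-line code without
 * conditional branches.
 */
#if 0
static int keys_match_sketch(u32 a1, u32 b1, u32 a2, u32 b2)
{
	/* non-zero bits survive wherever a field pair differs */
	return ((a1 ^ a2) | (b1 ^ b2)) == 0;
}
#endif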
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable ** prev, * p;

		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth != tail; rth = next) {
			next = rth->dst.rt_next;
			rt_free(rth);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
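/*
 * A small self-contained sketch (not compiled in, hypothetical helper
 * name) of the fixed-point bookkeeping described above, with 3
 * fractional bits: per-bucket lengths are accumulated scaled by ONE,
 * and the threshold is recovered as max(elasticity, avg + 4*sd)
 * shifted back down.  The real computation is inlined in
 * rt_check_expire() below.
 */
#if 0
static unsigned long chain_length_max_sketch(const unsigned long *len,
					     unsigned long n,
					     unsigned long elasticity)
{
	unsigned long i, sum = 0, sum2 = 0, avg, sd;

	for (i = 0; i < n; i++) {
		sum  += len[i] << FRACT_BITS;	/* scaled by ONE */
		sum2 += (len[i] << FRACT_BITS) * (len[i] << FRACT_BITS);
	}
	avg = sum / n;
	sd  = int_sqrt(sum2 / n - avg * avg);	/* E[x^2] - E[x]^2 */
	return max_t(unsigned long, elasticity,
		     (avg + 4 * sd) >> FRACT_BITS);
}
#endif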
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(&aux->fl, &rth->fl))
			return 0;
		aux = aux->dst.rt_next;
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
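/*
 * Illustrative note (not compiled in): bumping the generation id above
 * invalidates the whole cache in O(1).  Every entry records the genid
 * current at insertion time, and lookups treat any entry whose genid
 * differs from the per-netns counter as dead (see rt_is_expired());
 * the memory is then reclaimed lazily by rt_do_flush() or
 * rt_check_expire().  Because each perturbation advances the counter
 * by 1..256, a previously issued value cannot recur until at least
 * 2^32 / 256 = 2^24 further invalidations.
 */
#if 0
static bool entry_is_stale_sketch(const struct rtable *rth, struct net *net)
{
	return rth->rt_genid != rt_genid(net);
}
#endif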
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit the cache size.
 */
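/*
 * A minimal sketch (not compiled in) of the feedback loop described
 * above, assuming the caller halves "expire" while the goal is unmet
 * and lets it creep back up once equilibrium is reached.  The helper
 * itself is hypothetical; the real logic is inlined in
 * rt_garbage_collect() below, using the same tunables.
 */
#if 0
static unsigned long adapt_expire_sketch(unsigned long expire,
					 int goal_reached)
{
	if (!goal_reached)
		return expire >> 1;		/* expire more aggressively */
	expire += ip_rt_gc_min_interval;	/* relax slowly when idle */
	return min_t(unsigned long, expire, ip_rt_gc_timeout);
}
#endif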
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}
/*
 * Returns the number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rth->dst.rt_next;
	}
	return length >> FRACT_BITS;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 * Note also the rt_free uses call_rcu.  We don't actually
		 * need rcu protection here, this is just our path to get
		 * on the route gc list.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					       "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length; when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
				       ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}
void rt_bind_peer(struct rtable *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID being unique in a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->dst.dev), net)) {
					rthp = &rth->dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->dst.dev != dev)
					break;

				dst_hold(&rth->dst);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->dst.__use		= 1;
				atomic_set(&rt->dst.__refcnt, 1);
				rt->dst.child		= NULL;
				if (rt->dst.dev)
					dev_hold(rt->dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->dst.obsolete	= -1;
				rt->dst.lastuse		= jiffies;
				rt->dst.path		= &rt->dst;
				rt->dst.neighbour	= NULL;
				rt->dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->dst) ||
				    !(rt->dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->dst.neighbour)
						neigh_event_send(rt->dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->dst;
				netevent.new = &rt->dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->dst.expires &&
			    time_after_eq(jiffies, rt->dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
		rt->dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
		rt->dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->dst.rate_last +
			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->dst.rate_last = jiffies;
		++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->dst.rate_tokens += now - rt->dst.rate_last;
	if (rt->dst.rate_tokens > ip_rt_error_burst)
		rt->dst.rate_tokens = ip_rt_error_burst;
	rt->dst.rate_last = now;
	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
		rt->dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
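/*
 * Worked example (not compiled in): how the plateau table drives PMTU
 * discovery when a Fragmentation Needed message carries no usable
 * next-hop MTU.  Starting from an original datagram of 1500 bytes,
 * guess_mtu(1500) returns 1492, a further failure gives
 * guess_mtu(1492) == 576, then 296, and so on down the table, never
 * going below 68.
 */
#if 0
static void guess_mtu_example(void)
{
	unsigned short mtu = 1500;

	mtu = guess_mtu(mtu);	/* 1492 */
	mtu = guess_mtu(mtu);	/* 576  */
	mtu = guess_mtu(mtu);	/* 296  */
}
#endif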
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->dst)) {
					if (mtu < dst_mtu(&rth->dst)) {
						dst_confirm(&rth->dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	if (rt_is_expired((struct rtable *)dst))
		return NULL;
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in memory.
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else {
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->dst.metrics, fi->fib_metrics,
		       sizeof(rt->dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->dst.dev->mtu > 576)
				rt->dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;

	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
						       ip_rt_min_advmss);
	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->idev	= in_dev_get(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies.  See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->dst.dev	= (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->idev	= in_dev_get(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have a correct destination already attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	err = fib_lookup(net, &fl, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= net->loopback_dev;
	dev_hold(rth->dst.dev);
	rth->idev	= in_dev_get(rth->dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->dst.input= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result the host on a multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		res->fi = NULL;
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4)
			res->fi = NULL;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	in_dev_hold(in_dev);
	rth->idev = in_dev;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->dst.output=ip_output;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;
	*result = rth;
	return 0;
}
/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned int flags = 0;
	struct net_device *dev_out = NULL;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif
	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface,
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
			if (dev_out == NULL)
				goto out;
			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   that the packet will not leave this host and the
			   route is valid). Luckily, this hack is a good
			   workaround.
			 */
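			/* For illustration only (a userspace sketch, not
			   kernel code; names are made up):

				bind(fd, &addr_on_chosen_iface, alen);
				sendto(fd, buf, len, 0, &mcast_dst, dlen);

			   The bound source address selects dev_out below,
			   with no IP_MULTICAST_IF needed. */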
			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, oldflp->fl4_src, false))
				goto out;
		}
	}

	if (oldflp->oif) {
		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (rcu_dereference(dev_out->ip_ptr) == NULL)
			goto out;	/* Wrong error code */
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}
	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}
	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		dev_out = net->loopback_dev;
		fl.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);
	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

out:	return err;
}
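/*
 * __ip_route_output_key() is the cache front end for output routes: it
 * scans a single hash chain under rcu_read_lock_bh() and falls back to
 * ip_route_output_slow() on a miss, or when caching is disabled.
 */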
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned int hash;
	int res;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	res = ip_route_output_slow(net, rp, flp);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		=	AF_INET,
	.protocol	=	cpu_to_be16(ETH_P_IP),
	.destroy	=	ipv4_dst_destroy,
	.check		=	ipv4_blackhole_dst_check,
	.update_pmtu	=	ipv4_rt_blackhole_update_pmtu,
};
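/*
 * ipv4_dst_blackhole() clones an existing route into a dst whose
 * input/output handlers simply discard packets. ip_route_output_flow()
 * below substitutes it when __xfrm_lookup() returns -EREMOTE (typically
 * while xfrm state resolution is still pending).
 */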
static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->dst);
	*rp = rt;
	return rt ? 0 : -ENOMEM;
}
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
EXPORT_SYMBOL(ip_route_output_key);
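/*
 * A minimal usage sketch (not from this file; error handling elided and
 * variable names illustrative):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl) == 0) {
 *		... transmit via rt->dst ...
 *		ip_rt_put(rt);
 *	}
 */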
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
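/*
 * inet_rtm_getroute() services RTM_GETROUTE requests: it builds a dummy
 * skb, resolves the route via ip_route_input() (when RTA_IIF is given) or
 * ip_route_output_key(), and answers with rt_fill_info().
 */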
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
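/*
 * ip_rt_dump() walks every hash bucket and emits one RTM_NEWROUTE message
 * per live cache entry; cb->args[0]/args[1] carry the (bucket, index)
 * resume point between netlink dump callbacks.
 */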
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
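/*
 * For example (a userspace sketch):
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * feeds the written delay (in seconds; 0 requests an immediate flush)
 * through the handler above into rt_cache_flush().
 */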
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
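/*
 * These knobs appear under /proc/sys/net/ipv4/route/. For example
 * (a userspace sketch):
 *
 *	sysctl -w net.ipv4.route.gc_timeout=300
 *
 * sets ip_rt_gc_timeout (in seconds, converted to jiffies by
 * proc_dointvec_jiffies) at run time.
 */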
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
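/*
 * Each namespace starts with a random rt_genid; cached routes whose genid
 * no longer matches are treated as expired (see the rt_is_expired()
 * checks above), so bumping the genid invalidates the whole cache at once.
 */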
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
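/*
 * For example, booting with "rhash_entries=262144" on the kernel command
 * line fixes the route cache hash at that many entries instead of the
 * memory-scaled default computed in ip_rt_init() below.
 */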
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup, tend
	   to synchronize. Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif