net: Compute protocol sequence numbers and fragment IDs using MD5, CVE-2011-3188
[linux-flexiantxendom0-natty.git] net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      though our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/secure_seq.h>
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
134
135 static struct delayed_work expires_work;
136 static unsigned long expires_ljiffies;
137
138 /*
139  *      Interface to generic destination cache.
140  */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
144 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
145 static void              ipv4_dst_destroy(struct dst_entry *dst);
146 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
147 static void              ipv4_link_failure(struct sk_buff *skb);
148 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
149 static int rt_garbage_collect(struct dst_ops *ops);
150
151 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152                             int how)
153 {
154 }
155
156 static struct dst_ops ipv4_dst_ops = {
157         .family =               AF_INET,
158         .protocol =             cpu_to_be16(ETH_P_IP),
159         .gc =                   rt_garbage_collect,
160         .check =                ipv4_dst_check,
161         .default_advmss =       ipv4_default_advmss,
162         .default_mtu =          ipv4_default_mtu,
163         .destroy =              ipv4_dst_destroy,
164         .ifdown =               ipv4_dst_ifdown,
165         .negative_advice =      ipv4_negative_advice,
166         .link_failure =         ipv4_link_failure,
167         .update_pmtu =          ip_rt_update_pmtu,
168         .local_out =            __ip_local_out,
169 };
170
171 #define ECN_OR_COST(class)      TC_PRIO_##class
172
173 const __u8 ip_tos2prio[16] = {
174         TC_PRIO_BESTEFFORT,
175         ECN_OR_COST(BESTEFFORT),
176         TC_PRIO_BESTEFFORT,
177         ECN_OR_COST(BESTEFFORT),
178         TC_PRIO_BULK,
179         ECN_OR_COST(BULK),
180         TC_PRIO_BULK,
181         ECN_OR_COST(BULK),
182         TC_PRIO_INTERACTIVE,
183         ECN_OR_COST(INTERACTIVE),
184         TC_PRIO_INTERACTIVE,
185         ECN_OR_COST(INTERACTIVE),
186         TC_PRIO_INTERACTIVE_BULK,
187         ECN_OR_COST(INTERACTIVE_BULK),
188         TC_PRIO_INTERACTIVE_BULK,
189         ECN_OR_COST(INTERACTIVE_BULK)
190 };
191
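/*
 * For illustration: the table above is indexed with IPTOS_TOS(tos) >> 1
 * (cf. rt_tos2priority() in <net/route.h>), so e.g. a TOS of 0x10
 * (IPTOS_LOWDELAY) gives index 8 and maps to TC_PRIO_INTERACTIVE, while
 * the odd entries are the ECN_OR_COST() variants of each class.
 */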
192
193 /*
194  * Route cache.
195  */
196
197 /* The locking scheme is rather straightforward:
198  *
199  * 1) Read-Copy Update protects the buckets of the central route hash.
200  * 2) Only writers remove entries, and they hold the lock
201  *    as they look at rtable reference counts.
202  * 3) Only readers acquire references to rtable entries,
203  *    they do so with atomic increments and with the
204  *    lock held.
205  */
206
207 struct rt_hash_bucket {
208         struct rtable __rcu     *chain;
209 };
210
211 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
212         defined(CONFIG_PROVE_LOCKING)
213 /*
214  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
215  * The size of this table is a power of two and depends on the number of CPUs.
216  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
217  */
218 #ifdef CONFIG_LOCKDEP
219 # define RT_HASH_LOCK_SZ        256
220 #else
221 # if NR_CPUS >= 32
222 #  define RT_HASH_LOCK_SZ       4096
223 # elif NR_CPUS >= 16
224 #  define RT_HASH_LOCK_SZ       2048
225 # elif NR_CPUS >= 8
226 #  define RT_HASH_LOCK_SZ       1024
227 # elif NR_CPUS >= 4
228 #  define RT_HASH_LOCK_SZ       512
229 # else
230 #  define RT_HASH_LOCK_SZ       256
231 # endif
232 #endif
233
234 static spinlock_t       *rt_hash_locks;
235 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
236
237 static __init void rt_hash_lock_init(void)
238 {
239         int i;
240
241         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
242                         GFP_KERNEL);
243         if (!rt_hash_locks)
244                 panic("IP: failed to allocate rt_hash_locks\n");
245
246         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
247                 spin_lock_init(&rt_hash_locks[i]);
248 }
249 #else
250 # define rt_hash_lock_addr(slot) NULL
251
252 static inline void rt_hash_lock_init(void)
253 {
254 }
255 #endif
256
257 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
258 static unsigned                 rt_hash_mask __read_mostly;
259 static unsigned int             rt_hash_log  __read_mostly;
260
261 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
262 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
263
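/*
 * rt_hash() folds the per-namespace generation id (rt_genid) into the
 * jhash of daddr/saddr/ifindex.  Bumping the genid (rt_cache_invalidate)
 * therefore makes existing entries unreachable to new lookups and lets
 * rt_is_expired() reap them lazily, without walking the whole table.
 */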
264 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
265                                    int genid)
266 {
267         return jhash_3words((__force u32)daddr, (__force u32)saddr,
268                             idx, genid)
269                 & rt_hash_mask;
270 }
271
272 static inline int rt_genid(struct net *net)
273 {
274         return atomic_read(&net->ipv4.rt_genid);
275 }
276
277 #ifdef CONFIG_PROC_FS
278 struct rt_cache_iter_state {
279         struct seq_net_private p;
280         int bucket;
281         int genid;
282 };
283
284 static struct rtable *rt_cache_get_first(struct seq_file *seq)
285 {
286         struct rt_cache_iter_state *st = seq->private;
287         struct rtable *r = NULL;
288
289         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
290                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
291                         continue;
292                 rcu_read_lock_bh();
293                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
294                 while (r) {
295                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
296                             r->rt_genid == st->genid)
297                                 return r;
298                         r = rcu_dereference_bh(r->dst.rt_next);
299                 }
300                 rcu_read_unlock_bh();
301         }
302         return r;
303 }
304
305 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
306                                           struct rtable *r)
307 {
308         struct rt_cache_iter_state *st = seq->private;
309
310         r = rcu_dereference_bh(r->dst.rt_next);
311         while (!r) {
312                 rcu_read_unlock_bh();
313                 do {
314                         if (--st->bucket < 0)
315                                 return NULL;
316                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
317                 rcu_read_lock_bh();
318                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
319         }
320         return r;
321 }
322
323 static struct rtable *rt_cache_get_next(struct seq_file *seq,
324                                         struct rtable *r)
325 {
326         struct rt_cache_iter_state *st = seq->private;
327         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
328                 if (dev_net(r->dst.dev) != seq_file_net(seq))
329                         continue;
330                 if (r->rt_genid == st->genid)
331                         break;
332         }
333         return r;
334 }
335
336 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
337 {
338         struct rtable *r = rt_cache_get_first(seq);
339
340         if (r)
341                 while (pos && (r = rt_cache_get_next(seq, r)))
342                         --pos;
343         return pos ? NULL : r;
344 }
345
346 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
347 {
348         struct rt_cache_iter_state *st = seq->private;
349         if (*pos)
350                 return rt_cache_get_idx(seq, *pos - 1);
351         st->genid = rt_genid(seq_file_net(seq));
352         return SEQ_START_TOKEN;
353 }
354
355 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
356 {
357         struct rtable *r;
358
359         if (v == SEQ_START_TOKEN)
360                 r = rt_cache_get_first(seq);
361         else
362                 r = rt_cache_get_next(seq, v);
363         ++*pos;
364         return r;
365 }
366
367 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
368 {
369         if (v && v != SEQ_START_TOKEN)
370                 rcu_read_unlock_bh();
371 }
372
373 static int rt_cache_seq_show(struct seq_file *seq, void *v)
374 {
375         if (v == SEQ_START_TOKEN)
376                 seq_printf(seq, "%-127s\n",
377                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
378                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
379                            "HHUptod\tSpecDst");
380         else {
381                 struct rtable *r = v;
382                 int len;
383
384                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
385                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
386                         r->dst.dev ? r->dst.dev->name : "*",
387                         (__force u32)r->rt_dst,
388                         (__force u32)r->rt_gateway,
389                         r->rt_flags, atomic_read(&r->dst.__refcnt),
390                         r->dst.__use, 0, (__force u32)r->rt_src,
391                         dst_metric_advmss(&r->dst) + 40,
392                         dst_metric(&r->dst, RTAX_WINDOW),
393                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
394                               dst_metric(&r->dst, RTAX_RTTVAR)),
395                         r->fl.fl4_tos,
396                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
397                         r->dst.hh ? (r->dst.hh->hh_output ==
398                                        dev_queue_xmit) : 0,
399                         r->rt_spec_dst, &len);
400
401                 seq_printf(seq, "%*s\n", 127 - len, "");
402         }
403         return 0;
404 }
405
406 static const struct seq_operations rt_cache_seq_ops = {
407         .start  = rt_cache_seq_start,
408         .next   = rt_cache_seq_next,
409         .stop   = rt_cache_seq_stop,
410         .show   = rt_cache_seq_show,
411 };
412
413 static int rt_cache_seq_open(struct inode *inode, struct file *file)
414 {
415         return seq_open_net(inode, file, &rt_cache_seq_ops,
416                         sizeof(struct rt_cache_iter_state));
417 }
418
419 static const struct file_operations rt_cache_seq_fops = {
420         .owner   = THIS_MODULE,
421         .open    = rt_cache_seq_open,
422         .read    = seq_read,
423         .llseek  = seq_lseek,
424         .release = seq_release_net,
425 };
426
427
428 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
429 {
430         int cpu;
431
432         if (*pos == 0)
433                 return SEQ_START_TOKEN;
434
435         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
436                 if (!cpu_possible(cpu))
437                         continue;
438                 *pos = cpu+1;
439                 return &per_cpu(rt_cache_stat, cpu);
440         }
441         return NULL;
442 }
443
444 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
445 {
446         int cpu;
447
448         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
449                 if (!cpu_possible(cpu))
450                         continue;
451                 *pos = cpu+1;
452                 return &per_cpu(rt_cache_stat, cpu);
453         }
454         return NULL;
455
456 }
457
458 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
459 {
460
461 }
462
463 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
464 {
465         struct rt_cache_stat *st = v;
466
467         if (v == SEQ_START_TOKEN) {
468                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
469                 return 0;
470         }
471
472         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
473                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
474                    dst_entries_get_slow(&ipv4_dst_ops),
475                    st->in_hit,
476                    st->in_slow_tot,
477                    st->in_slow_mc,
478                    st->in_no_route,
479                    st->in_brd,
480                    st->in_martian_dst,
481                    st->in_martian_src,
482
483                    st->out_hit,
484                    st->out_slow_tot,
485                    st->out_slow_mc,
486
487                    st->gc_total,
488                    st->gc_ignored,
489                    st->gc_goal_miss,
490                    st->gc_dst_overflow,
491                    st->in_hlist_search,
492                    st->out_hlist_search
493                 );
494         return 0;
495 }
496
497 static const struct seq_operations rt_cpu_seq_ops = {
498         .start  = rt_cpu_seq_start,
499         .next   = rt_cpu_seq_next,
500         .stop   = rt_cpu_seq_stop,
501         .show   = rt_cpu_seq_show,
502 };
503
504
505 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
506 {
507         return seq_open(file, &rt_cpu_seq_ops);
508 }
509
510 static const struct file_operations rt_cpu_seq_fops = {
511         .owner   = THIS_MODULE,
512         .open    = rt_cpu_seq_open,
513         .read    = seq_read,
514         .llseek  = seq_lseek,
515         .release = seq_release,
516 };
517
518 #ifdef CONFIG_NET_CLS_ROUTE
519 static int rt_acct_proc_show(struct seq_file *m, void *v)
520 {
521         struct ip_rt_acct *dst, *src;
522         unsigned int i, j;
523
524         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
525         if (!dst)
526                 return -ENOMEM;
527
528         for_each_possible_cpu(i) {
529                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
530                 for (j = 0; j < 256; j++) {
531                         dst[j].o_bytes   += src[j].o_bytes;
532                         dst[j].o_packets += src[j].o_packets;
533                         dst[j].i_bytes   += src[j].i_bytes;
534                         dst[j].i_packets += src[j].i_packets;
535                 }
536         }
537
538         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
539         kfree(dst);
540         return 0;
541 }
542
543 static int rt_acct_proc_open(struct inode *inode, struct file *file)
544 {
545         return single_open(file, rt_acct_proc_show, NULL);
546 }
547
548 static const struct file_operations rt_acct_proc_fops = {
549         .owner          = THIS_MODULE,
550         .open           = rt_acct_proc_open,
551         .read           = seq_read,
552         .llseek         = seq_lseek,
553         .release        = single_release,
554 };
555 #endif
556
557 static int __net_init ip_rt_do_proc_init(struct net *net)
558 {
559         struct proc_dir_entry *pde;
560
561         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
562                         &rt_cache_seq_fops);
563         if (!pde)
564                 goto err1;
565
566         pde = proc_create("rt_cache", S_IRUGO,
567                           net->proc_net_stat, &rt_cpu_seq_fops);
568         if (!pde)
569                 goto err2;
570
571 #ifdef CONFIG_NET_CLS_ROUTE
572         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
573         if (!pde)
574                 goto err3;
575 #endif
576         return 0;
577
578 #ifdef CONFIG_NET_CLS_ROUTE
579 err3:
580         remove_proc_entry("rt_cache", net->proc_net_stat);
581 #endif
582 err2:
583         remove_proc_entry("rt_cache", net->proc_net);
584 err1:
585         return -ENOMEM;
586 }
587
588 static void __net_exit ip_rt_do_proc_exit(struct net *net)
589 {
590         remove_proc_entry("rt_cache", net->proc_net_stat);
591         remove_proc_entry("rt_cache", net->proc_net);
592 #ifdef CONFIG_NET_CLS_ROUTE
593         remove_proc_entry("rt_acct", net->proc_net);
594 #endif
595 }
596
597 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
598         .init = ip_rt_do_proc_init,
599         .exit = ip_rt_do_proc_exit,
600 };
601
602 static int __init ip_rt_proc_init(void)
603 {
604         return register_pernet_subsys(&ip_rt_proc_ops);
605 }
606
607 #else
608 static inline int ip_rt_proc_init(void)
609 {
610         return 0;
611 }
612 #endif /* CONFIG_PROC_FS */
613
614 static inline void rt_free(struct rtable *rt)
615 {
616         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
617 }
618
619 static inline void rt_drop(struct rtable *rt)
620 {
621         ip_rt_put(rt);
622         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
623 }
624
625 static inline int rt_fast_clean(struct rtable *rth)
626 {
627         /* Kill broadcast/multicast entries very aggressively, if they
628            collide in the hash table with more useful entries */
629         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
630                 rt_is_input_route(rth) && rth->dst.rt_next;
631 }
632
633 static inline int rt_valuable(struct rtable *rth)
634 {
635         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
636                 rth->dst.expires;
637 }
638
639 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
640 {
641         unsigned long age;
642         int ret = 0;
643
644         if (atomic_read(&rth->dst.__refcnt))
645                 goto out;
646
647         ret = 1;
648         if (rth->dst.expires &&
649             time_after_eq(jiffies, rth->dst.expires))
650                 goto out;
651
652         age = jiffies - rth->dst.lastuse;
653         ret = 0;
654         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
655             (age <= tmo2 && rt_valuable(rth)))
656                 goto out;
657         ret = 1;
658 out:    return ret;
659 }
660
661 /* Bits of score are:
662  * 31: very valuable
663  * 30: not quite useless
664  * 29..0: usage counter
665  */
666 static inline u32 rt_score(struct rtable *rt)
667 {
668         u32 score = jiffies - rt->dst.lastuse;
669
670         score = ~score & ~(3<<30);
671
672         if (rt_valuable(rt))
673                 score |= (1<<31);
674
675         if (rt_is_output_route(rt) ||
676             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
677                 score |= (1<<30);
678
679         return score;
680 }
681
682 static inline bool rt_caching(const struct net *net)
683 {
684         return net->ipv4.current_rt_cache_rebuild_count <=
685                 net->ipv4.sysctl_rt_cache_rebuild_count;
686 }
687
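/*
 * The two key comparison helpers below XOR each pair of fields and OR the
 * results together: the value is zero iff every field matches, which
 * avoids a chain of conditional branches.  compare_hash_inputs() only
 * checks daddr, saddr and iif, while compare_keys() additionally checks
 * mark, tos and oif.
 */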
688 static inline bool compare_hash_inputs(const struct flowi *fl1,
689                                         const struct flowi *fl2)
690 {
691         return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
692                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
693                 (fl1->iif ^ fl2->iif)) == 0);
694 }
695
696 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
697 {
698         return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
699                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
700                 (fl1->mark ^ fl2->mark) |
701                 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
702                 (fl1->oif ^ fl2->oif) |
703                 (fl1->iif ^ fl2->iif)) == 0;
704 }
705
706 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
707 {
708         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
709 }
710
711 static inline int rt_is_expired(struct rtable *rth)
712 {
713         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
714 }
715
716 /*
717  * Perform a full scan of the hash table and free all entries.
718  * Can be called by a softirq or a process.
719  * In the latter case, we want to be rescheduled if necessary.
720  */
721 static void rt_do_flush(struct net *net, int process_context)
722 {
723         unsigned int i;
724         struct rtable *rth, *next;
725
726         for (i = 0; i <= rt_hash_mask; i++) {
727                 struct rtable __rcu **pprev;
728                 struct rtable *list;
729
730                 if (process_context && need_resched())
731                         cond_resched();
732                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
733                 if (!rth)
734                         continue;
735
736                 spin_lock_bh(rt_hash_lock_addr(i));
737
738                 list = NULL;
739                 pprev = &rt_hash_table[i].chain;
740                 rth = rcu_dereference_protected(*pprev,
741                         lockdep_is_held(rt_hash_lock_addr(i)));
742
743                 while (rth) {
744                         next = rcu_dereference_protected(rth->dst.rt_next,
745                                 lockdep_is_held(rt_hash_lock_addr(i)));
746
747                         if (!net ||
748                             net_eq(dev_net(rth->dst.dev), net)) {
749                                 rcu_assign_pointer(*pprev, next);
750                                 rcu_assign_pointer(rth->dst.rt_next, list);
751                                 list = rth;
752                         } else {
753                                 pprev = &rth->dst.rt_next;
754                         }
755                         rth = next;
756                 }
757
758                 spin_unlock_bh(rt_hash_lock_addr(i));
759
760                 for (; list; list = next) {
761                         next = rcu_dereference_protected(list->dst.rt_next, 1);
762                         rt_free(list);
763                 }
764         }
765 }
766
767 /*
768  * While freeing expired entries, we compute average chain length
769  * and standard deviation, using fixed-point arithmetic.
770  * This gives an estimate of rt_chain_length_max:
771  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
772  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
773  */
774
775 #define FRACT_BITS 3
776 #define ONE (1UL << FRACT_BITS)
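/*
 * Worked example of the fixed-point arithmetic above: with FRACT_BITS = 3,
 * ONE is 8, so has_noalias() contributes 8 per counted entry and the
 * per-bucket "length", the average and the standard deviation are all kept
 * in eighths of an entry; shifting (avg + 4*sd) right by FRACT_BITS
 * converts the result back to whole entries for rt_chain_length_max.
 */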
777
778 /*
779  * Given a hash chain and an item in this hash chain,
780  * find if a previous entry has the same hash_inputs
781  * (but differs on tos, mark or oif)
782  * Returns 0 if an alias is found.
783  * Returns ONE if rth has no alias before itself.
784  */
785 static int has_noalias(const struct rtable *head, const struct rtable *rth)
786 {
787         const struct rtable *aux = head;
788
789         while (aux != rth) {
790                 if (compare_hash_inputs(&aux->fl, &rth->fl))
791                         return 0;
792                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
793         }
794         return ONE;
795 }
796
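/*
 * rt_check_expire() scans only part of the table on each run: the goal is
 * (elapsed jiffies * number of buckets) / ip_rt_gc_timeout buckets, so the
 * whole table is covered roughly once per ip_rt_gc_timeout.  Within a
 * chain, tmo is halved for every entry that is kept, so entries deep in a
 * long chain are expired more aggressively.
 */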
797 static void rt_check_expire(void)
798 {
799         static unsigned int rover;
800         unsigned int i = rover, goal;
801         struct rtable *rth;
802         struct rtable __rcu **rthp;
803         unsigned long samples = 0;
804         unsigned long sum = 0, sum2 = 0;
805         unsigned long delta;
806         u64 mult;
807
808         delta = jiffies - expires_ljiffies;
809         expires_ljiffies = jiffies;
810         mult = ((u64)delta) << rt_hash_log;
811         if (ip_rt_gc_timeout > 1)
812                 do_div(mult, ip_rt_gc_timeout);
813         goal = (unsigned int)mult;
814         if (goal > rt_hash_mask)
815                 goal = rt_hash_mask + 1;
816         for (; goal > 0; goal--) {
817                 unsigned long tmo = ip_rt_gc_timeout;
818                 unsigned long length;
819
820                 i = (i + 1) & rt_hash_mask;
821                 rthp = &rt_hash_table[i].chain;
822
823                 if (need_resched())
824                         cond_resched();
825
826                 samples++;
827
828                 if (rcu_dereference_raw(*rthp) == NULL)
829                         continue;
830                 length = 0;
831                 spin_lock_bh(rt_hash_lock_addr(i));
832                 while ((rth = rcu_dereference_protected(*rthp,
833                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
834                         prefetch(rth->dst.rt_next);
835                         if (rt_is_expired(rth)) {
836                                 *rthp = rth->dst.rt_next;
837                                 rt_free(rth);
838                                 continue;
839                         }
840                         if (rth->dst.expires) {
841                                 /* Entry is expired even if it is in use */
842                                 if (time_before_eq(jiffies, rth->dst.expires)) {
843 nofree:
844                                         tmo >>= 1;
845                                         rthp = &rth->dst.rt_next;
846                                         /*
847                                          * We only count entries on
848                                          * a chain with equal hash inputs once
849                                          * so that entries for different QOS
850                                          * levels, and other non-hash input
851                                          * attributes don't unfairly skew
852                                          * the length computation
853                                          */
854                                         length += has_noalias(rt_hash_table[i].chain, rth);
855                                         continue;
856                                 }
857                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
858                                 goto nofree;
859
860                         /* Cleanup aged off entries. */
861                         *rthp = rth->dst.rt_next;
862                         rt_free(rth);
863                 }
864                 spin_unlock_bh(rt_hash_lock_addr(i));
865                 sum += length;
866                 sum2 += length*length;
867         }
868         if (samples) {
869                 unsigned long avg = sum / samples;
870                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
871                 rt_chain_length_max = max_t(unsigned long,
872                                         ip_rt_gc_elasticity,
873                                         (avg + 4*sd) >> FRACT_BITS);
874         }
875         rover = i;
876 }
877
878 /*
879  * rt_worker_func() is run in process context.
880  * we call rt_check_expire() to scan part of the hash table
881  */
882 static void rt_worker_func(struct work_struct *work)
883 {
884         rt_check_expire();
885         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
886 }
887
888 /*
889  * Perturbation of rt_genid by a small quantity [1..256]
890  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
891  * many times (2^24) without reusing a recent rt_genid value.
892  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
893  */
894 static void rt_cache_invalidate(struct net *net)
895 {
896         unsigned char shuffle;
897
898         get_random_bytes(&shuffle, sizeof(shuffle));
899         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
900 }
901
902 /*
903  * delay < 0  : invalidate cache (fast : entries will be deleted later)
904  * delay >= 0 : invalidate & flush cache (can be long)
905  */
906 void rt_cache_flush(struct net *net, int delay)
907 {
908         rt_cache_invalidate(net);
909         if (delay >= 0)
910                 rt_do_flush(net, !in_softirq());
911 }
912
913 /* Flush previously invalidated entries from the cache */
914 void rt_cache_flush_batch(struct net *net)
915 {
916         rt_do_flush(net, !in_softirq());
917 }
918
919 static void rt_emergency_hash_rebuild(struct net *net)
920 {
921         if (net_ratelimit())
922                 printk(KERN_WARNING "Route hash chain too long!\n");
923         rt_cache_invalidate(net);
924 }
925
926 /*
927    Short description of GC goals.
928
929    We want to build an algorithm which keeps the routing cache
930    at some equilibrium point, where the number of aged-off entries
931    stays approximately equal to the number of newly generated ones.
932
933    Current expiration strength is the variable "expire".
934    We try to adjust it dynamically, so that when networking
935    is idle, expire is large enough to keep enough warm entries,
936    and when load increases it shrinks to limit the cache size.
937  */
938
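/*
 * Rough sketch of the computation below: when the cache holds more than
 * ip_rt_gc_elasticity entries per bucket we try to free at least half of
 * the overshoot; otherwise we only trim back towards an equilibrium
 * derived from gc_thresh.  "expire" is halved each time the goal is
 * missed and is grown again under work_done once the cache is back under
 * control.
 */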
939 static int rt_garbage_collect(struct dst_ops *ops)
940 {
941         static unsigned long expire = RT_GC_TIMEOUT;
942         static unsigned long last_gc;
943         static int rover;
944         static int equilibrium;
945         struct rtable *rth;
946         struct rtable __rcu **rthp;
947         unsigned long now = jiffies;
948         int goal;
949         int entries = dst_entries_get_fast(&ipv4_dst_ops);
950
951         /*
952          * Garbage collection is pretty expensive,
953          * do not make it too frequently.
954          */
955
956         RT_CACHE_STAT_INC(gc_total);
957
958         if (now - last_gc < ip_rt_gc_min_interval &&
959             entries < ip_rt_max_size) {
960                 RT_CACHE_STAT_INC(gc_ignored);
961                 goto out;
962         }
963
964         entries = dst_entries_get_slow(&ipv4_dst_ops);
965         /* Calculate the number of entries which we want to expire now. */
966         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
967         if (goal <= 0) {
968                 if (equilibrium < ipv4_dst_ops.gc_thresh)
969                         equilibrium = ipv4_dst_ops.gc_thresh;
970                 goal = entries - equilibrium;
971                 if (goal > 0) {
972                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
973                         goal = entries - equilibrium;
974                 }
975         } else {
976                 /* We are in dangerous area. Try to reduce cache really
977                  * aggressively.
978                  */
979                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
980                 equilibrium = entries - goal;
981         }
982
983         if (now - last_gc >= ip_rt_gc_min_interval)
984                 last_gc = now;
985
986         if (goal <= 0) {
987                 equilibrium += goal;
988                 goto work_done;
989         }
990
991         do {
992                 int i, k;
993
994                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
995                         unsigned long tmo = expire;
996
997                         k = (k + 1) & rt_hash_mask;
998                         rthp = &rt_hash_table[k].chain;
999                         spin_lock_bh(rt_hash_lock_addr(k));
1000                         while ((rth = rcu_dereference_protected(*rthp,
1001                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1002                                 if (!rt_is_expired(rth) &&
1003                                         !rt_may_expire(rth, tmo, expire)) {
1004                                         tmo >>= 1;
1005                                         rthp = &rth->dst.rt_next;
1006                                         continue;
1007                                 }
1008                                 *rthp = rth->dst.rt_next;
1009                                 rt_free(rth);
1010                                 goal--;
1011                         }
1012                         spin_unlock_bh(rt_hash_lock_addr(k));
1013                         if (goal <= 0)
1014                                 break;
1015                 }
1016                 rover = k;
1017
1018                 if (goal <= 0)
1019                         goto work_done;
1020
1021                 /* Goal is not achieved. We stop the process if:
1022
1023                    - expire has been reduced to zero; otherwise expire is halved.
1024                    - the table is not full.
1025                    - we are called from interrupt context.
1026                    - the jiffies check is just a fallback/debug loop breaker.
1027                      We will not spin here for a long time in any case.
1028                  */
1029
1030                 RT_CACHE_STAT_INC(gc_goal_miss);
1031
1032                 if (expire == 0)
1033                         break;
1034
1035                 expire >>= 1;
1036 #if RT_CACHE_DEBUG >= 2
1037                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1038                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1039 #endif
1040
1041                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1042                         goto out;
1043         } while (!in_softirq() && time_before_eq(jiffies, now));
1044
1045         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1046                 goto out;
1047         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1048                 goto out;
1049         if (net_ratelimit())
1050                 printk(KERN_WARNING "dst cache overflow\n");
1051         RT_CACHE_STAT_INC(gc_dst_overflow);
1052         return 1;
1053
1054 work_done:
1055         expire += ip_rt_gc_min_interval;
1056         if (expire > ip_rt_gc_timeout ||
1057             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1058             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1059                 expire = ip_rt_gc_timeout;
1060 #if RT_CACHE_DEBUG >= 2
1061         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1062                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1063 #endif
1064 out:    return 0;
1065 }
1066
1067 /*
1068  * Returns the number of entries in a hash chain that have different hash_inputs
1069  */
1070 static int slow_chain_length(const struct rtable *head)
1071 {
1072         int length = 0;
1073         const struct rtable *rth = head;
1074
1075         while (rth) {
1076                 length += has_noalias(head, rth);
1077                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1078         }
1079         return length >> FRACT_BITS;
1080 }
1081
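/*
 * rt_intern_hash() inserts rt into the chain selected by "hash".  If an
 * entry with identical keys is already cached it is moved to the front and
 * reused instead, and rt is dropped.  Otherwise the lowest-scoring
 * unreferenced entry (per rt_score()) is remembered as an eviction
 * candidate for chains longer than ip_rt_gc_elasticity, and an overlong
 * chain with no eviction candidate can trigger an emergency hash rebuild.
 */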
1082 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1083                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1084 {
1085         struct rtable   *rth, *cand;
1086         struct rtable __rcu **rthp, **candp;
1087         unsigned long   now;
1088         u32             min_score;
1089         int             chain_length;
1090         int attempts = !in_softirq();
1091
1092 restart:
1093         chain_length = 0;
1094         min_score = ~(u32)0;
1095         cand = NULL;
1096         candp = NULL;
1097         now = jiffies;
1098
1099         if (!rt_caching(dev_net(rt->dst.dev))) {
1100                 /*
1101                  * If we're not caching, just tell the caller we
1102                  * were successful and don't touch the route.  The
1103                  * caller holds the sole reference to the cache entry, and
1104                  * it will be released when the caller is done with it.
1105                  * If we drop it here, the callers have no way to resolve routes
1106                  * when we're not caching.  Instead, just point *rp at rt, so
1107                  * the caller gets a single use out of the route.
1108                  * Note that we do rt_free on this new route entry, so that
1109                  * once its refcount hits zero, we are still able to reap it
1110                  * (Thanks Alexey)
1111                  * Note: To avoid expensive rcu stuff for this uncached dst,
1112                  * we set DST_NOCACHE so that dst_release() can free dst without
1113                  * waiting a grace period.
1114                  */
1115
1116                 rt->dst.flags |= DST_NOCACHE;
1117                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1118                         int err = arp_bind_neighbour(&rt->dst);
1119                         if (err) {
1120                                 if (net_ratelimit())
1121                                         printk(KERN_WARNING
1122                                             "Neighbour table failure & not caching routes.\n");
1123                                 ip_rt_put(rt);
1124                                 return err;
1125                         }
1126                 }
1127
1128                 goto skip_hashing;
1129         }
1130
1131         rthp = &rt_hash_table[hash].chain;
1132
1133         spin_lock_bh(rt_hash_lock_addr(hash));
1134         while ((rth = rcu_dereference_protected(*rthp,
1135                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1136                 if (rt_is_expired(rth)) {
1137                         *rthp = rth->dst.rt_next;
1138                         rt_free(rth);
1139                         continue;
1140                 }
1141                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1142                         /* Put it first */
1143                         *rthp = rth->dst.rt_next;
1144                         /*
1145                          * Since lookup is lockfree, the deletion
1146                          * must be visible to another weakly ordered CPU before
1147                          * the insertion at the start of the hash chain.
1148                          */
1149                         rcu_assign_pointer(rth->dst.rt_next,
1150                                            rt_hash_table[hash].chain);
1151                         /*
1152                          * Since lookup is lockfree, the update writes
1153                          * must be ordered for consistency on SMP.
1154                          */
1155                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1156
1157                         dst_use(&rth->dst, now);
1158                         spin_unlock_bh(rt_hash_lock_addr(hash));
1159
1160                         rt_drop(rt);
1161                         if (rp)
1162                                 *rp = rth;
1163                         else
1164                                 skb_dst_set(skb, &rth->dst);
1165                         return 0;
1166                 }
1167
1168                 if (!atomic_read(&rth->dst.__refcnt)) {
1169                         u32 score = rt_score(rth);
1170
1171                         if (score <= min_score) {
1172                                 cand = rth;
1173                                 candp = rthp;
1174                                 min_score = score;
1175                         }
1176                 }
1177
1178                 chain_length++;
1179
1180                 rthp = &rth->dst.rt_next;
1181         }
1182
1183         if (cand) {
1184                 /* ip_rt_gc_elasticity used to be the average chain
1185                  * length; when exceeded, gc becomes really aggressive.
1186                  *
1187                  * The second limit is less certain. At the moment it allows
1188                  * only 2 entries per bucket. We will see.
1189                  */
1190                 if (chain_length > ip_rt_gc_elasticity) {
1191                         *candp = cand->dst.rt_next;
1192                         rt_free(cand);
1193                 }
1194         } else {
1195                 if (chain_length > rt_chain_length_max &&
1196                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1197                         struct net *net = dev_net(rt->dst.dev);
1198                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1199                         if (!rt_caching(net)) {
1200                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1201                                         rt->dst.dev->name, num);
1202                         }
1203                         rt_emergency_hash_rebuild(net);
1204                         spin_unlock_bh(rt_hash_lock_addr(hash));
1205
1206                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1207                                         ifindex, rt_genid(net));
1208                         goto restart;
1209                 }
1210         }
1211
1212         /* Try to bind the route to arp only if it is an output
1213            route or a unicast forwarding path.
1214          */
1215         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1216                 int err = arp_bind_neighbour(&rt->dst);
1217                 if (err) {
1218                         spin_unlock_bh(rt_hash_lock_addr(hash));
1219
1220                         if (err != -ENOBUFS) {
1221                                 rt_drop(rt);
1222                                 return err;
1223                         }
1224
1225                         /* Neighbour tables are full and nothing
1226                            can be released. Try to shrink the route cache;
1227                            it most likely holds some neighbour records.
1228                          */
1229                         if (attempts-- > 0) {
1230                                 int saved_elasticity = ip_rt_gc_elasticity;
1231                                 int saved_int = ip_rt_gc_min_interval;
1232                                 ip_rt_gc_elasticity     = 1;
1233                                 ip_rt_gc_min_interval   = 0;
1234                                 rt_garbage_collect(&ipv4_dst_ops);
1235                                 ip_rt_gc_min_interval   = saved_int;
1236                                 ip_rt_gc_elasticity     = saved_elasticity;
1237                                 goto restart;
1238                         }
1239
1240                         if (net_ratelimit())
1241                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1242                         rt_drop(rt);
1243                         return -ENOBUFS;
1244                 }
1245         }
1246
1247         rt->dst.rt_next = rt_hash_table[hash].chain;
1248
1249 #if RT_CACHE_DEBUG >= 2
1250         if (rt->dst.rt_next) {
1251                 struct rtable *trt;
1252                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1253                        hash, &rt->rt_dst);
1254                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1255                         printk(" . %pI4", &trt->rt_dst);
1256                 printk("\n");
1257         }
1258 #endif
1259         /*
1260          * Since lookup is lockfree, we must make sure
1261          * previous writes to rt are committed to memory
1262          * before making rt visible to other CPUs.
1263          */
1264         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1265
1266         spin_unlock_bh(rt_hash_lock_addr(hash));
1267
1268 skip_hashing:
1269         if (rp)
1270                 *rp = rt;
1271         else
1272                 skb_dst_set(skb, &rt->dst);
1273         return 0;
1274 }
1275
1276 void rt_bind_peer(struct rtable *rt, int create)
1277 {
1278         struct inet_peer *peer;
1279
1280         peer = inet_getpeer_v4(rt->rt_dst, create);
1281
1282         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1283                 inet_putpeer(peer);
1284 }
1285
1286 /*
1287  * Peer allocation may fail only in serious out-of-memory conditions.  However
1288  * we can still generate some output.
1289  * Random ID selection looks a bit dangerous because we have no chance of
1290  * selecting an ID that is unique over a reasonable period of time.
1291  * But a broken packet identifier may be better than no packet at all.
1292  */
1293 static void ip_select_fb_ident(struct iphdr *iph)
1294 {
1295         static DEFINE_SPINLOCK(ip_fb_id_lock);
1296         static u32 ip_fallback_id;
1297         u32 salt;
1298
1299         spin_lock_bh(&ip_fb_id_lock);
1300         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1301         iph->id = htons(salt & 0xFFFF);
1302         ip_fallback_id = salt;
1303         spin_unlock_bh(&ip_fb_id_lock);
1304 }
1305
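/*
 * IP identification selection: prefer the per-destination counter kept in
 * the inet_peer (via inet_getid()); only when no peer can be allocated do
 * we fall back to ip_select_fb_ident() above, which derives the ID from
 * secure_ip_id() (the keyed generator from <net/secure_seq.h> that the
 * CVE-2011-3188 change switched to MD5), salted with the previous
 * fallback value.
 */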
1306 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1307 {
1308         struct rtable *rt = (struct rtable *) dst;
1309
1310         if (rt) {
1311                 if (rt->peer == NULL)
1312                         rt_bind_peer(rt, 1);
1313
1314                 /* If a peer is attached to the destination, it is never detached,
1315                    so we do not need to grab a lock to dereference it.
1316                  */
1317                 if (rt->peer) {
1318                         iph->id = htons(inet_getid(rt->peer, more));
1319                         return;
1320                 }
1321         } else
1322                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1323                        __builtin_return_address(0));
1324
1325         ip_select_fb_ident(iph);
1326 }
1327 EXPORT_SYMBOL(__ip_select_ident);
1328
1329 static void rt_del(unsigned hash, struct rtable *rt)
1330 {
1331         struct rtable __rcu **rthp;
1332         struct rtable *aux;
1333
1334         rthp = &rt_hash_table[hash].chain;
1335         spin_lock_bh(rt_hash_lock_addr(hash));
1336         ip_rt_put(rt);
1337         while ((aux = rcu_dereference_protected(*rthp,
1338                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1339                 if (aux == rt || rt_is_expired(aux)) {
1340                         *rthp = aux->dst.rt_next;
1341                         rt_free(aux);
1342                         continue;
1343                 }
1344                 rthp = &aux->dst.rt_next;
1345         }
1346         spin_unlock_bh(rt_hash_lock_addr(hash));
1347 }
1348
1349 /* called in rcu_read_lock() section */
1350 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1351                     __be32 saddr, struct net_device *dev)
1352 {
1353         int i, k;
1354         struct in_device *in_dev = __in_dev_get_rcu(dev);
1355         struct rtable *rth;
1356         struct rtable __rcu **rthp;
1357         __be32  skeys[2] = { saddr, 0 };
1358         int  ikeys[2] = { dev->ifindex, 0 };
1359         struct netevent_redirect netevent;
1360         struct net *net;
1361
1362         if (!in_dev)
1363                 return;
1364
1365         net = dev_net(dev);
1366         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1367             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1368             ipv4_is_zeronet(new_gw))
1369                 goto reject_redirect;
1370
1371         if (!rt_caching(net))
1372                 goto reject_redirect;
1373
1374         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1375                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1376                         goto reject_redirect;
1377                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1378                         goto reject_redirect;
1379         } else {
1380                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1381                         goto reject_redirect;
1382         }
1383
1384         for (i = 0; i < 2; i++) {
1385                 for (k = 0; k < 2; k++) {
1386                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1387                                                 rt_genid(net));
1388
1389                         rthp = &rt_hash_table[hash].chain;
1390
1391                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1392                                 struct rtable *rt;
1393
1394                                 if (rth->fl.fl4_dst != daddr ||
1395                                     rth->fl.fl4_src != skeys[i] ||
1396                                     rth->fl.oif != ikeys[k] ||
1397                                     rt_is_input_route(rth) ||
1398                                     rt_is_expired(rth) ||
1399                                     !net_eq(dev_net(rth->dst.dev), net)) {
1400                                         rthp = &rth->dst.rt_next;
1401                                         continue;
1402                                 }
1403
1404                                 if (rth->rt_dst != daddr ||
1405                                     rth->rt_src != saddr ||
1406                                     rth->dst.error ||
1407                                     rth->rt_gateway != old_gw ||
1408                                     rth->dst.dev != dev)
1409                                         break;
1410
1411                                 dst_hold(&rth->dst);
1412
1413                                 rt = dst_alloc(&ipv4_dst_ops);
1414                                 if (rt == NULL) {
1415                                         ip_rt_put(rth);
1416                                         return;
1417                                 }
1418
1419                                 /* Copy all the information. */
1420                                 *rt = *rth;
1421                                 rt->dst.__use           = 1;
1422                                 atomic_set(&rt->dst.__refcnt, 1);
1423                                 rt->dst.child           = NULL;
1424                                 if (rt->dst.dev)
1425                                         dev_hold(rt->dst.dev);
1426                                 rt->dst.obsolete        = -1;
1427                                 rt->dst.lastuse = jiffies;
1428                                 rt->dst.path            = &rt->dst;
1429                                 rt->dst.neighbour       = NULL;
1430                                 rt->dst.hh              = NULL;
1431 #ifdef CONFIG_XFRM
1432                                 rt->dst.xfrm            = NULL;
1433 #endif
1434                                 rt->rt_genid            = rt_genid(net);
1435                                 rt->rt_flags            |= RTCF_REDIRECTED;
1436
1437                                 /* Gateway is different ... */
1438                                 rt->rt_gateway          = new_gw;
1439
1440                                 /* Redirect received -> path was valid */
1441                                 dst_confirm(&rth->dst);
1442
1443                                 if (rt->peer)
1444                                         atomic_inc(&rt->peer->refcnt);
1445
1446                                 if (arp_bind_neighbour(&rt->dst) ||
1447                                     !(rt->dst.neighbour->nud_state &
1448                                             NUD_VALID)) {
1449                                         if (rt->dst.neighbour)
1450                                                 neigh_event_send(rt->dst.neighbour, NULL);
1451                                         ip_rt_put(rth);
1452                                         rt_drop(rt);
1453                                         goto do_next;
1454                                 }
1455
1456                                 netevent.old = &rth->dst;
1457                                 netevent.new = &rt->dst;
1458                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1459                                                         &netevent);
1460
1461                                 rt_del(hash, rth);
1462                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1463                                         ip_rt_put(rt);
1464                                 goto do_next;
1465                         }
1466                 do_next:
1467                         ;
1468                 }
1469         }
1470         return;
1471
1472 reject_redirect:
1473 #ifdef CONFIG_IP_ROUTE_VERBOSE
1474         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1475                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1476                         "  Advised path = %pI4 -> %pI4\n",
1477                        &old_gw, dev->name, &new_gw,
1478                        &saddr, &daddr);
1479 #endif
1480         ;
1481 }
1482
1483 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1484 {
1485         struct rtable *rt = (struct rtable *)dst;
1486         struct dst_entry *ret = dst;
1487
1488         if (rt) {
1489                 if (dst->obsolete > 0) {
1490                         ip_rt_put(rt);
1491                         ret = NULL;
1492                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1493                            (rt->dst.expires &&
1494                             time_after_eq(jiffies, rt->dst.expires))) {
1495                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1496                                                 rt->fl.oif,
1497                                                 rt_genid(dev_net(dst->dev)));
1498 #if RT_CACHE_DEBUG >= 1
1499                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1500                                 &rt->rt_dst, rt->fl.fl4_tos);
1501 #endif
1502                         rt_del(hash, rt);
1503                         ret = NULL;
1504                 }
1505         }
1506         return ret;
1507 }
1508
1509 /*
1510  * Algorithm:
1511  *      1. The first ip_rt_redirect_number redirects are sent
1512  *         with exponential backoff, then we stop sending them altogether,
1513  *         assuming that the host ignores our redirects.
1514  *      2. If we did not see packets requiring redirects
1515  *         during ip_rt_redirect_silence, we assume that the host
1516  *         forgot the redirected route and start sending redirects again.
1517  *
1518  * This algorithm is much cheaper and more intelligent than dumb load limiting
1519  * in icmp.c.
1520  *
1521  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1522  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1523  */
1524
1525 void ip_rt_send_redirect(struct sk_buff *skb)
1526 {
1527         struct rtable *rt = skb_rtable(skb);
1528         struct in_device *in_dev;
1529         int log_martians;
1530
1531         rcu_read_lock();
1532         in_dev = __in_dev_get_rcu(rt->dst.dev);
1533         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1534                 rcu_read_unlock();
1535                 return;
1536         }
1537         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1538         rcu_read_unlock();
1539
1540         /* No redirected packets during ip_rt_redirect_silence;
1541          * reset the algorithm.
1542          */
1543         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1544                 rt->dst.rate_tokens = 0;
1545
1546         /* Too many ignored redirects; do not send anything;
1547          * set dst.rate_last to the last seen redirected packet.
1548          */
1549         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1550                 rt->dst.rate_last = jiffies;
1551                 return;
1552         }
1553
1554         /* Check for load limit; set rate_last to the latest sent
1555          * redirect.
1556          */
1557         if (rt->dst.rate_tokens == 0 ||
1558             time_after(jiffies,
1559                        (rt->dst.rate_last +
1560                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1561                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1562                 rt->dst.rate_last = jiffies;
1563                 ++rt->dst.rate_tokens;
1564 #ifdef CONFIG_IP_ROUTE_VERBOSE
1565                 if (log_martians &&
1566                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1567                     net_ratelimit())
1568                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1569                                 &rt->rt_src, rt->rt_iif,
1570                                 &rt->rt_dst, &rt->rt_gateway);
1571 #endif
1572         }
1573 }
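
/*
 * Editorial sketch (not part of the original route.c): with the rate
 * limiting above, the next redirect is allowed only once
 * ip_rt_redirect_load << rate_tokens jiffies have elapsed since the
 * previous one, so the gap doubles for every redirect the host ignores,
 * until ip_rt_redirect_number redirects have been sent.  The helper
 * below is hypothetical and merely restates that arithmetic.
 */
static inline unsigned long example_redirect_gap(unsigned long load,
						 unsigned int tokens)
{
	/* jiffies that must pass before redirect number (tokens + 1) */
	return load << tokens;
}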
1574
1575 static int ip_error(struct sk_buff *skb)
1576 {
1577         struct rtable *rt = skb_rtable(skb);
1578         unsigned long now;
1579         int code;
1580
1581         switch (rt->dst.error) {
1582                 case EINVAL:
1583                 default:
1584                         goto out;
1585                 case EHOSTUNREACH:
1586                         code = ICMP_HOST_UNREACH;
1587                         break;
1588                 case ENETUNREACH:
1589                         code = ICMP_NET_UNREACH;
1590                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1591                                         IPSTATS_MIB_INNOROUTES);
1592                         break;
1593                 case EACCES:
1594                         code = ICMP_PKT_FILTERED;
1595                         break;
1596         }
1597
1598         now = jiffies;
1599         rt->dst.rate_tokens += now - rt->dst.rate_last;
1600         if (rt->dst.rate_tokens > ip_rt_error_burst)
1601                 rt->dst.rate_tokens = ip_rt_error_burst;
1602         rt->dst.rate_last = now;
1603         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1604                 rt->dst.rate_tokens -= ip_rt_error_cost;
1605                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1606         }
1607
1608 out:    kfree_skb(skb);
1609         return 0;
1610 }
1611
1612 /*
1613  *      The last two values are not from the RFC but
1614  *      are needed for AMPRnet AX.25 paths.
1615  */
1616
1617 static const unsigned short mtu_plateau[] =
1618 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1619
1620 static inline unsigned short guess_mtu(unsigned short old_mtu)
1621 {
1622         int i;
1623
1624         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1625                 if (old_mtu > mtu_plateau[i])
1626                         return mtu_plateau[i];
1627         return 68;
1628 }
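
/*
 * Editorial sketch (not part of the original route.c): guess_mtu()
 * returns the largest plateau strictly smaller than the reported old
 * packet size, following the RFC 1191 plateau search; e.g. an old_mtu
 * of 1500 yields 1492, 1492 itself drops to 576, and anything at or
 * below 128 falls back to the IPv4 minimum of 68.  A hypothetical
 * (unused) self-check of those cases:
 */
static inline int example_guess_mtu_check(void)
{
	return guess_mtu(1500) == 1492 &&
	       guess_mtu(1492) == 576 &&
	       guess_mtu(100)  == 68;
}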
1629
1630 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1631                                  unsigned short new_mtu,
1632                                  struct net_device *dev)
1633 {
1634         int i, k;
1635         unsigned short old_mtu = ntohs(iph->tot_len);
1636         struct rtable *rth;
1637         int  ikeys[2] = { dev->ifindex, 0 };
1638         __be32  skeys[2] = { iph->saddr, 0, };
1639         __be32  daddr = iph->daddr;
1640         unsigned short est_mtu = 0;
1641
1642         for (k = 0; k < 2; k++) {
1643                 for (i = 0; i < 2; i++) {
1644                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1645                                                 rt_genid(net));
1646
1647                         rcu_read_lock();
1648                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1649                              rth = rcu_dereference(rth->dst.rt_next)) {
1650                                 unsigned short mtu = new_mtu;
1651
1652                                 if (rth->fl.fl4_dst != daddr ||
1653                                     rth->fl.fl4_src != skeys[i] ||
1654                                     rth->rt_dst != daddr ||
1655                                     rth->rt_src != iph->saddr ||
1656                                     rth->fl.oif != ikeys[k] ||
1657                                     rt_is_input_route(rth) ||
1658                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1659                                     !net_eq(dev_net(rth->dst.dev), net) ||
1660                                     rt_is_expired(rth))
1661                                         continue;
1662
1663                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1664
1665                                         /* BSD 4.2 compatibility hack :-( */
1666                                         if (mtu == 0 &&
1667                                             old_mtu >= dst_mtu(&rth->dst) &&
1668                                             old_mtu >= 68 + (iph->ihl << 2))
1669                                                 old_mtu -= iph->ihl << 2;
1670
1671                                         mtu = guess_mtu(old_mtu);
1672                                 }
1673                                 if (mtu <= dst_mtu(&rth->dst)) {
1674                                         if (mtu < dst_mtu(&rth->dst)) {
1675                                                 dst_confirm(&rth->dst);
1676                                                 if (mtu < ip_rt_min_pmtu) {
1677                                                         u32 lock = dst_metric(&rth->dst,
1678                                                                               RTAX_LOCK);
1679                                                         mtu = ip_rt_min_pmtu;
1680                                                         lock |= (1 << RTAX_MTU);
1681                                                         dst_metric_set(&rth->dst, RTAX_LOCK,
1682                                                                        lock);
1683                                                 }
1684                                                 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1685                                                 dst_set_expires(&rth->dst,
1686                                                         ip_rt_mtu_expires);
1687                                         }
1688                                         est_mtu = mtu;
1689                                 }
1690                         }
1691                         rcu_read_unlock();
1692                 }
1693         }
1694         return est_mtu ? : new_mtu;
1695 }
1696
1697 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1698 {
1699         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1700             !(dst_metric_locked(dst, RTAX_MTU))) {
1701                 if (mtu < ip_rt_min_pmtu) {
1702                         u32 lock = dst_metric(dst, RTAX_LOCK);
1703                         mtu = ip_rt_min_pmtu;
1704                         dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1705                 }
1706                 dst_metric_set(dst, RTAX_MTU, mtu);
1707                 dst_set_expires(dst, ip_rt_mtu_expires);
1708                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1709         }
1710 }
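
/*
 * Editorial sketch (not part of the original route.c): once the learned
 * PMTU has been clamped up to ip_rt_min_pmtu, ip_rt_update_pmtu() sets
 * the RTAX_MTU bit in the RTAX_LOCK metric, and the
 * !dst_metric_locked(dst, RTAX_MTU) test at the top then rejects any
 * further reductions.  A hypothetical helper showing only that lock-bit
 * manipulation:
 */
static inline void example_lock_mtu_metric(struct dst_entry *dst)
{
	/* mark RTAX_MTU as locked so later PMTU updates are ignored */
	dst_metric_set(dst, RTAX_LOCK,
		       dst_metric(dst, RTAX_LOCK) | (1 << RTAX_MTU));
}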
1711
1712 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1713 {
1714         if (rt_is_expired((struct rtable *)dst))
1715                 return NULL;
1716         return dst;
1717 }
1718
1719 static void ipv4_dst_destroy(struct dst_entry *dst)
1720 {
1721         struct rtable *rt = (struct rtable *) dst;
1722         struct inet_peer *peer = rt->peer;
1723
1724         if (peer) {
1725                 rt->peer = NULL;
1726                 inet_putpeer(peer);
1727         }
1728 }
1729
1730
1731 static void ipv4_link_failure(struct sk_buff *skb)
1732 {
1733         struct rtable *rt;
1734
1735         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1736
1737         rt = skb_rtable(skb);
1738         if (rt)
1739                 dst_set_expires(&rt->dst, 0);
1740 }
1741
1742 static int ip_rt_bug(struct sk_buff *skb)
1743 {
1744         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1745                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1746                 skb->dev ? skb->dev->name : "?");
1747         kfree_skb(skb);
1748         return 0;
1749 }
1750
1751 /*
1752    We do not cache the source address of the outgoing interface,
1753    because it is used only by the IP RR, TS and SRR options,
1754    so it is out of the fast path.
1755
1756    BTW remember: "addr" is allowed to be unaligned
1757    in IP options!
1758  */
1759
1760 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1761 {
1762         __be32 src;
1763         struct fib_result res;
1764
1765         if (rt_is_output_route(rt))
1766                 src = rt->rt_src;
1767         else {
1768                 rcu_read_lock();
1769                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1770                         src = FIB_RES_PREFSRC(res);
1771                 else
1772                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1773                                         RT_SCOPE_UNIVERSE);
1774                 rcu_read_unlock();
1775         }
1776         memcpy(addr, &src, 4);
1777 }
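
/*
 * Editorial sketch (not part of the original route.c): the "addr"
 * handed to ip_rt_get_source() points into the IP options area and can
 * therefore sit at any byte offset, which is why the address is written
 * with memcpy() above rather than a direct 32-bit store.  A hypothetical
 * illustration of that choice:
 */
static inline void example_store_unaligned(u8 *optptr, __be32 src)
{
	/* safe regardless of optptr's alignment */
	memcpy(optptr, &src, 4);
	/* a direct "*(__be32 *)optptr = src;" could trap on strict-alignment CPUs */
}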
1778
1779 #ifdef CONFIG_NET_CLS_ROUTE
1780 static void set_class_tag(struct rtable *rt, u32 tag)
1781 {
1782         if (!(rt->dst.tclassid & 0xFFFF))
1783                 rt->dst.tclassid |= tag & 0xFFFF;
1784         if (!(rt->dst.tclassid & 0xFFFF0000))
1785                 rt->dst.tclassid |= tag & 0xFFFF0000;
1786 }
1787 #endif
1788
1789 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1790 {
1791         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1792
1793         if (advmss == 0) {
1794                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1795                                ip_rt_min_advmss);
1796                 if (advmss > 65535 - 40)
1797                         advmss = 65535 - 40;
1798         }
1799         return advmss;
1800 }
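
/*
 * Editorial sketch (not part of the original route.c): when no RTAX_ADVMSS
 * metric is set, the advertised MSS defaults to the device MTU minus 40
 * bytes (20 for the IPv4 header, 20 for the TCP header), floored at
 * ip_rt_min_advmss and capped at 65495; a standard 1500-byte Ethernet MTU
 * thus yields the familiar 1460.  A hypothetical restatement of that
 * arithmetic:
 */
static inline unsigned int example_default_advmss(unsigned int mtu,
						  unsigned int min_advmss)
{
	unsigned int advmss = max_t(unsigned int, mtu - 40, min_advmss);

	return min_t(unsigned int, advmss, 65535 - 40);
}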
1801
1802 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1803 {
1804         unsigned int mtu = dst->dev->mtu;
1805
1806         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1807                 const struct rtable *rt = (const struct rtable *) dst;
1808
1809                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1810                         mtu = 576;
1811         }
1812
1813         if (mtu > IP_MAX_MTU)
1814                 mtu = IP_MAX_MTU;
1815
1816         return mtu;
1817 }
1818
1819 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1820 {
1821         struct dst_entry *dst = &rt->dst;
1822         struct fib_info *fi = res->fi;
1823
1824         if (fi) {
1825                 if (FIB_RES_GW(*res) &&
1826                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1827                         rt->rt_gateway = FIB_RES_GW(*res);
1828                 dst_import_metrics(dst, fi->fib_metrics);
1829 #ifdef CONFIG_NET_CLS_ROUTE
1830                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1831 #endif
1832         }
1833
1834         if (dst_mtu(dst) > IP_MAX_MTU)
1835                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1836         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1837                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1838
1839 #ifdef CONFIG_NET_CLS_ROUTE
1840 #ifdef CONFIG_IP_MULTIPLE_TABLES
1841         set_class_tag(rt, fib_rules_tclass(res));
1842 #endif
1843         set_class_tag(rt, itag);
1844 #endif
1845         rt->rt_type = res->type;
1846 }
1847
1848 /* called in rcu_read_lock() section */
1849 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1850                                 u8 tos, struct net_device *dev, int our)
1851 {
1852         unsigned int hash;
1853         struct rtable *rth;
1854         __be32 spec_dst;
1855         struct in_device *in_dev = __in_dev_get_rcu(dev);
1856         u32 itag = 0;
1857         int err;
1858
1859         /* Primary sanity checks. */
1860
1861         if (in_dev == NULL)
1862                 return -EINVAL;
1863
1864         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1865             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1866                 goto e_inval;
1867
1868         if (ipv4_is_zeronet(saddr)) {
1869                 if (!ipv4_is_local_multicast(daddr))
1870                         goto e_inval;
1871                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1872         } else {
1873                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1874                                           &itag, 0);
1875                 if (err < 0)
1876                         goto e_err;
1877         }
1878         rth = dst_alloc(&ipv4_dst_ops);
1879         if (!rth)
1880                 goto e_nobufs;
1881
1882         rth->dst.output = ip_rt_bug;
1883         rth->dst.obsolete = -1;
1884
1885         atomic_set(&rth->dst.__refcnt, 1);
1886         rth->dst.flags= DST_HOST;
1887         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1888                 rth->dst.flags |= DST_NOPOLICY;
1889         rth->fl.fl4_dst = daddr;
1890         rth->rt_dst     = daddr;
1891         rth->fl.fl4_tos = tos;
1892         rth->fl.mark    = skb->mark;
1893         rth->fl.fl4_src = saddr;
1894         rth->rt_src     = saddr;
1895 #ifdef CONFIG_NET_CLS_ROUTE
1896         rth->dst.tclassid = itag;
1897 #endif
1898         rth->rt_iif     =
1899         rth->fl.iif     = dev->ifindex;
1900         rth->dst.dev    = init_net.loopback_dev;
1901         dev_hold(rth->dst.dev);
1902         rth->fl.oif     = 0;
1903         rth->rt_gateway = daddr;
1904         rth->rt_spec_dst= spec_dst;
1905         rth->rt_genid   = rt_genid(dev_net(dev));
1906         rth->rt_flags   = RTCF_MULTICAST;
1907         rth->rt_type    = RTN_MULTICAST;
1908         if (our) {
1909                 rth->dst.input= ip_local_deliver;
1910                 rth->rt_flags |= RTCF_LOCAL;
1911         }
1912
1913 #ifdef CONFIG_IP_MROUTE
1914         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1915                 rth->dst.input = ip_mr_input;
1916 #endif
1917         RT_CACHE_STAT_INC(in_slow_mc);
1918
1919         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1920         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1921
1922 e_nobufs:
1923         return -ENOBUFS;
1924 e_inval:
1925         return -EINVAL;
1926 e_err:
1927         return err;
1928 }
1929
1930
1931 static void ip_handle_martian_source(struct net_device *dev,
1932                                      struct in_device *in_dev,
1933                                      struct sk_buff *skb,
1934                                      __be32 daddr,
1935                                      __be32 saddr)
1936 {
1937         RT_CACHE_STAT_INC(in_martian_src);
1938 #ifdef CONFIG_IP_ROUTE_VERBOSE
1939         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1940                 /*
1941                  *      RFC1812 recommendation: if the source is martian,
1942                  *      the only hint is the MAC header.
1943                  */
1944                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1945                         &daddr, &saddr, dev->name);
1946                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1947                         int i;
1948                         const unsigned char *p = skb_mac_header(skb);
1949                         printk(KERN_WARNING "ll header: ");
1950                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1951                                 printk("%02x", *p);
1952                                 if (i < (dev->hard_header_len - 1))
1953                                         printk(":");
1954                         }
1955                         printk("\n");
1956                 }
1957         }
1958 #endif
1959 }
1960
1961 /* called in rcu_read_lock() section */
1962 static int __mkroute_input(struct sk_buff *skb,
1963                            struct fib_result *res,
1964                            struct in_device *in_dev,
1965                            __be32 daddr, __be32 saddr, u32 tos,
1966                            struct rtable **result)
1967 {
1968         struct rtable *rth;
1969         int err;
1970         struct in_device *out_dev;
1971         unsigned int flags = 0;
1972         __be32 spec_dst;
1973         u32 itag;
1974
1975         /* get a working reference to the output device */
1976         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1977         if (out_dev == NULL) {
1978                 if (net_ratelimit())
1979                         printk(KERN_CRIT "Bug in ip_route_input" \
1980                                "_slow(). Please, report\n");
1981                 return -EINVAL;
1982         }
1983
1984
1985         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1986                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1987         if (err < 0) {
1988                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1989                                          saddr);
1990
1991                 goto cleanup;
1992         }
1993
1994         if (err)
1995                 flags |= RTCF_DIRECTSRC;
1996
1997         if (out_dev == in_dev && err &&
1998             (IN_DEV_SHARED_MEDIA(out_dev) ||
1999              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2000                 flags |= RTCF_DOREDIRECT;
2001
2002         if (skb->protocol != htons(ETH_P_IP)) {
2003                 /* Not IP (i.e. ARP). Do not create a route if it is
2004                  * invalid for proxy arp. DNAT routes are always valid.
2005                  *
2006                  * The proxy arp feature has been extended to allow ARP
2007                  * replies back out the same interface, to support
2008                  * Private VLAN switch technologies. See arp.c.
2009                  */
2010                 if (out_dev == in_dev &&
2011                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2012                         err = -EINVAL;
2013                         goto cleanup;
2014                 }
2015         }
2016
2017
2018         rth = dst_alloc(&ipv4_dst_ops);
2019         if (!rth) {
2020                 err = -ENOBUFS;
2021                 goto cleanup;
2022         }
2023
2024         atomic_set(&rth->dst.__refcnt, 1);
2025         rth->dst.flags= DST_HOST;
2026         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2027                 rth->dst.flags |= DST_NOPOLICY;
2028         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2029                 rth->dst.flags |= DST_NOXFRM;
2030         rth->fl.fl4_dst = daddr;
2031         rth->rt_dst     = daddr;
2032         rth->fl.fl4_tos = tos;
2033         rth->fl.mark    = skb->mark;
2034         rth->fl.fl4_src = saddr;
2035         rth->rt_src     = saddr;
2036         rth->rt_gateway = daddr;
2037         rth->rt_iif     =
2038                 rth->fl.iif     = in_dev->dev->ifindex;
2039         rth->dst.dev    = (out_dev)->dev;
2040         dev_hold(rth->dst.dev);
2041         rth->fl.oif     = 0;
2042         rth->rt_spec_dst= spec_dst;
2043
2044         rth->dst.obsolete = -1;
2045         rth->dst.input = ip_forward;
2046         rth->dst.output = ip_output;
2047         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2048
2049         rt_set_nexthop(rth, res, itag);
2050
2051         rth->rt_flags = flags;
2052
2053         *result = rth;
2054         err = 0;
2055  cleanup:
2056         return err;
2057 }
2058
2059 static int ip_mkroute_input(struct sk_buff *skb,
2060                             struct fib_result *res,
2061                             const struct flowi *fl,
2062                             struct in_device *in_dev,
2063                             __be32 daddr, __be32 saddr, u32 tos)
2064 {
2065         struct rtable* rth = NULL;
2066         int err;
2067         unsigned hash;
2068
2069 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2070         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2071                 fib_select_multipath(fl, res);
2072 #endif
2073
2074         /* create a routing cache entry */
2075         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2076         if (err)
2077                 return err;
2078
2079         /* put it into the cache */
2080         hash = rt_hash(daddr, saddr, fl->iif,
2081                        rt_genid(dev_net(rth->dst.dev)));
2082         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2083 }
2084
2085 /*
2086  *      NOTE. We drop all packets that have local source
2087  *      addresses, because every properly looped-back packet
2088  *      must already have the correct destination attached by the output routine.
2089  *
2090  *      This approach solves two big problems:
2091  *      1. Non-simplex devices are handled properly.
2092  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2093  *      called with rcu_read_lock()
2094  */
2095
2096 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2097                                u8 tos, struct net_device *dev)
2098 {
2099         struct fib_result res;
2100         struct in_device *in_dev = __in_dev_get_rcu(dev);
2101         struct flowi fl = { .fl4_dst    = daddr,
2102                             .fl4_src    = saddr,
2103                             .fl4_tos    = tos,
2104                             .fl4_scope  = RT_SCOPE_UNIVERSE,
2105                             .mark = skb->mark,
2106                             .iif = dev->ifindex };
2107         unsigned        flags = 0;
2108         u32             itag = 0;
2109         struct rtable * rth;
2110         unsigned        hash;
2111         __be32          spec_dst;
2112         int             err = -EINVAL;
2113         struct net    * net = dev_net(dev);
2114
2115         /* IP on this device is disabled. */
2116
2117         if (!in_dev)
2118                 goto out;
2119
2120         /* Check for the most weird martians, which cannot be detected
2121            by fib_lookup.
2122          */
2123
2124         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2125             ipv4_is_loopback(saddr))
2126                 goto martian_source;
2127
2128         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2129                 goto brd_input;
2130
2131         /* Accept zero addresses only for limited broadcast;
2132          * I do not even know whether to fix it or not. Waiting for complaints :-)
2133          */
2134         if (ipv4_is_zeronet(saddr))
2135                 goto martian_source;
2136
2137         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2138                 goto martian_destination;
2139
2140         /*
2141          *      Now we are ready to route the packet.
2142          */
2143         err = fib_lookup(net, &fl, &res);
2144         if (err != 0) {
2145                 if (!IN_DEV_FORWARD(in_dev))
2146                         goto e_hostunreach;
2147                 goto no_route;
2148         }
2149
2150         RT_CACHE_STAT_INC(in_slow_tot);
2151
2152         if (res.type == RTN_BROADCAST)
2153                 goto brd_input;
2154
2155         if (res.type == RTN_LOCAL) {
2156                 err = fib_validate_source(saddr, daddr, tos,
2157                                           net->loopback_dev->ifindex,
2158                                           dev, &spec_dst, &itag, skb->mark);
2159                 if (err < 0)
2160                         goto martian_source_keep_err;
2161                 if (err)
2162                         flags |= RTCF_DIRECTSRC;
2163                 spec_dst = daddr;
2164                 goto local_input;
2165         }
2166
2167         if (!IN_DEV_FORWARD(in_dev))
2168                 goto e_hostunreach;
2169         if (res.type != RTN_UNICAST)
2170                 goto martian_destination;
2171
2172         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2173 out:    return err;
2174
2175 brd_input:
2176         if (skb->protocol != htons(ETH_P_IP))
2177                 goto e_inval;
2178
2179         if (ipv4_is_zeronet(saddr))
2180                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2181         else {
2182                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2183                                           &itag, skb->mark);
2184                 if (err < 0)
2185                         goto martian_source_keep_err;
2186                 if (err)
2187                         flags |= RTCF_DIRECTSRC;
2188         }
2189         flags |= RTCF_BROADCAST;
2190         res.type = RTN_BROADCAST;
2191         RT_CACHE_STAT_INC(in_brd);
2192
2193 local_input:
2194         rth = dst_alloc(&ipv4_dst_ops);
2195         if (!rth)
2196                 goto e_nobufs;
2197
2198         rth->dst.output= ip_rt_bug;
2199         rth->dst.obsolete = -1;
2200         rth->rt_genid = rt_genid(net);
2201
2202         atomic_set(&rth->dst.__refcnt, 1);
2203         rth->dst.flags= DST_HOST;
2204         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2205                 rth->dst.flags |= DST_NOPOLICY;
2206         rth->fl.fl4_dst = daddr;
2207         rth->rt_dst     = daddr;
2208         rth->fl.fl4_tos = tos;
2209         rth->fl.mark    = skb->mark;
2210         rth->fl.fl4_src = saddr;
2211         rth->rt_src     = saddr;
2212 #ifdef CONFIG_NET_CLS_ROUTE
2213         rth->dst.tclassid = itag;
2214 #endif
2215         rth->rt_iif     =
2216         rth->fl.iif     = dev->ifindex;
2217         rth->dst.dev    = net->loopback_dev;
2218         dev_hold(rth->dst.dev);
2219         rth->rt_gateway = daddr;
2220         rth->rt_spec_dst= spec_dst;
2221         rth->dst.input= ip_local_deliver;
2222         rth->rt_flags   = flags|RTCF_LOCAL;
2223         if (res.type == RTN_UNREACHABLE) {
2224                 rth->dst.input= ip_error;
2225                 rth->dst.error= -err;
2226                 rth->rt_flags   &= ~RTCF_LOCAL;
2227         }
2228         rth->rt_type    = res.type;
2229         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2230         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2231         goto out;
2232
2233 no_route:
2234         RT_CACHE_STAT_INC(in_no_route);
2235         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2236         res.type = RTN_UNREACHABLE;
2237         if (err == -ESRCH)
2238                 err = -ENETUNREACH;
2239         goto local_input;
2240
2241         /*
2242          *      Do not cache martian addresses: they should be logged (RFC1812)
2243          */
2244 martian_destination:
2245         RT_CACHE_STAT_INC(in_martian_dst);
2246 #ifdef CONFIG_IP_ROUTE_VERBOSE
2247         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2248                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2249                         &daddr, &saddr, dev->name);
2250 #endif
2251
2252 e_hostunreach:
2253         err = -EHOSTUNREACH;
2254         goto out;
2255
2256 e_inval:
2257         err = -EINVAL;
2258         goto out;
2259
2260 e_nobufs:
2261         err = -ENOBUFS;
2262         goto out;
2263
2264 martian_source:
2265         err = -EINVAL;
2266 martian_source_keep_err:
2267         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2268         goto out;
2269 }
2270
2271 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2272                            u8 tos, struct net_device *dev, bool noref)
2273 {
2274         struct rtable * rth;
2275         unsigned        hash;
2276         int iif = dev->ifindex;
2277         struct net *net;
2278         int res;
2279
2280         net = dev_net(dev);
2281
2282         rcu_read_lock();
2283
2284         if (!rt_caching(net))
2285                 goto skip_cache;
2286
2287         tos &= IPTOS_RT_MASK;
2288         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2289
2290         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2291              rth = rcu_dereference(rth->dst.rt_next)) {
2292                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2293                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2294                      (rth->fl.iif ^ iif) |
2295                      rth->fl.oif |
2296                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2297                     rth->fl.mark == skb->mark &&
2298                     net_eq(dev_net(rth->dst.dev), net) &&
2299                     !rt_is_expired(rth)) {
2300                         if (noref) {
2301                                 dst_use_noref(&rth->dst, jiffies);
2302                                 skb_dst_set_noref(skb, &rth->dst);
2303                         } else {
2304                                 dst_use(&rth->dst, jiffies);
2305                                 skb_dst_set(skb, &rth->dst);
2306                         }
2307                         RT_CACHE_STAT_INC(in_hit);
2308                         rcu_read_unlock();
2309                         return 0;
2310                 }
2311                 RT_CACHE_STAT_INC(in_hlist_search);
2312         }
2313
2314 skip_cache:
2315         /* Multicast recognition logic was moved from the route cache to here.
2316            The problem was that too many Ethernet cards have broken/missing
2317            hardware multicast filters :-( As a result, a host on a multicast
2318            network acquires a lot of useless route cache entries, e.g. for
2319            SDR messages from all over the world. Now we try to get rid of them.
2320            Really, provided the software IP multicast filter is organized
2321            reasonably (at least, hashed), it does not result in a slowdown
2322            compared with route cache reject entries.
2323            Note that multicast routers are not affected, because
2324            a route cache entry is created for them eventually.
2325          */
2326         if (ipv4_is_multicast(daddr)) {
2327                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2328
2329                 if (in_dev) {
2330                         int our = ip_check_mc(in_dev, daddr, saddr,
2331                                               ip_hdr(skb)->protocol);
2332                         if (our
2333 #ifdef CONFIG_IP_MROUTE
2334                                 ||
2335                             (!ipv4_is_local_multicast(daddr) &&
2336                              IN_DEV_MFORWARD(in_dev))
2337 #endif
2338                            ) {
2339                                 int res = ip_route_input_mc(skb, daddr, saddr,
2340                                                             tos, dev, our);
2341                                 rcu_read_unlock();
2342                                 return res;
2343                         }
2344                 }
2345                 rcu_read_unlock();
2346                 return -EINVAL;
2347         }
2348         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2349         rcu_read_unlock();
2350         return res;
2351 }
2352 EXPORT_SYMBOL(ip_route_input_common);
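
/*
 * Editorial sketch (not part of the original route.c): a minimal,
 * hypothetical caller of the input-route lookup above.  On success a dst
 * has been attached to the skb (by reference only when noref is true) and
 * the packet can be handed on via dst_input(); ip_hdr() and dst_input()
 * are assumed from the standard kernel headers.
 */
static inline int example_route_incoming(struct sk_buff *skb,
					 struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_common(skb, iph->daddr, iph->saddr,
				    iph->tos, dev, false);
	if (err)
		return err;		/* martian source, no route, ... */

	return dst_input(skb);		/* ip_forward or ip_local_deliver */
}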
2353
2354 /* called with rcu_read_lock() */
2355 static int __mkroute_output(struct rtable **result,
2356                             struct fib_result *res,
2357                             const struct flowi *fl,
2358                             const struct flowi *oldflp,
2359                             struct net_device *dev_out,
2360                             unsigned flags)
2361 {
2362         struct rtable *rth;
2363         struct in_device *in_dev;
2364         u32 tos = RT_FL_TOS(oldflp);
2365
2366         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2367                 return -EINVAL;
2368
2369         if (ipv4_is_lbcast(fl->fl4_dst))
2370                 res->type = RTN_BROADCAST;
2371         else if (ipv4_is_multicast(fl->fl4_dst))
2372                 res->type = RTN_MULTICAST;
2373         else if (ipv4_is_zeronet(fl->fl4_dst))
2374                 return -EINVAL;
2375
2376         if (dev_out->flags & IFF_LOOPBACK)
2377                 flags |= RTCF_LOCAL;
2378
2379         in_dev = __in_dev_get_rcu(dev_out);
2380         if (!in_dev)
2381                 return -EINVAL;
2382
2383         if (res->type == RTN_BROADCAST) {
2384                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2385                 res->fi = NULL;
2386         } else if (res->type == RTN_MULTICAST) {
2387                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2388                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2389                                  oldflp->proto))
2390                         flags &= ~RTCF_LOCAL;
2391                 /* If a multicast route does not exist, use the
2392                  * default one, but do not gateway in this case.
2393                  * Yes, it is a hack.
2394                  */
2395                 if (res->fi && res->prefixlen < 4)
2396                         res->fi = NULL;
2397         }
2398
2399
2400         rth = dst_alloc(&ipv4_dst_ops);
2401         if (!rth)
2402                 return -ENOBUFS;
2403
2404         atomic_set(&rth->dst.__refcnt, 1);
2405         rth->dst.flags= DST_HOST;
2406         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2407                 rth->dst.flags |= DST_NOXFRM;
2408         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2409                 rth->dst.flags |= DST_NOPOLICY;
2410
2411         rth->fl.fl4_dst = oldflp->fl4_dst;
2412         rth->fl.fl4_tos = tos;
2413         rth->fl.fl4_src = oldflp->fl4_src;
2414         rth->fl.oif     = oldflp->oif;
2415         rth->fl.mark    = oldflp->mark;
2416         rth->rt_dst     = fl->fl4_dst;
2417         rth->rt_src     = fl->fl4_src;
2418         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2419         /* get references to the devices that are to be held by the routing
2420            cache entry */
2421         rth->dst.dev    = dev_out;
2422         dev_hold(dev_out);
2423         rth->rt_gateway = fl->fl4_dst;
2424         rth->rt_spec_dst= fl->fl4_src;
2425
2426         rth->dst.output=ip_output;
2427         rth->dst.obsolete = -1;
2428         rth->rt_genid = rt_genid(dev_net(dev_out));
2429
2430         RT_CACHE_STAT_INC(out_slow_tot);
2431
2432         if (flags & RTCF_LOCAL) {
2433                 rth->dst.input = ip_local_deliver;
2434                 rth->rt_spec_dst = fl->fl4_dst;
2435         }
2436         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2437                 rth->rt_spec_dst = fl->fl4_src;
2438                 if (flags & RTCF_LOCAL &&
2439                     !(dev_out->flags & IFF_LOOPBACK)) {
2440                         rth->dst.output = ip_mc_output;
2441                         RT_CACHE_STAT_INC(out_slow_mc);
2442                 }
2443 #ifdef CONFIG_IP_MROUTE
2444                 if (res->type == RTN_MULTICAST) {
2445                         if (IN_DEV_MFORWARD(in_dev) &&
2446                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2447                                 rth->dst.input = ip_mr_input;
2448                                 rth->dst.output = ip_mc_output;
2449                         }
2450                 }
2451 #endif
2452         }
2453
2454         rt_set_nexthop(rth, res, 0);
2455
2456         rth->rt_flags = flags;
2457         *result = rth;
2458         return 0;
2459 }
2460
2461 /* called with rcu_read_lock() */
2462 static int ip_mkroute_output(struct rtable **rp,
2463                              struct fib_result *res,
2464                              const struct flowi *fl,
2465                              const struct flowi *oldflp,
2466                              struct net_device *dev_out,
2467                              unsigned flags)
2468 {
2469         struct rtable *rth = NULL;
2470         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2471         unsigned hash;
2472         if (err == 0) {
2473                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2474                                rt_genid(dev_net(dev_out)));
2475                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2476         }
2477
2478         return err;
2479 }
2480
2481 /*
2482  * Major route resolver routine.
2483  * called with rcu_read_lock();
2484  */
2485
2486 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2487                                 const struct flowi *oldflp)
2488 {
2489         u32 tos = RT_FL_TOS(oldflp);
2490         struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2491                             .fl4_src = oldflp->fl4_src,
2492                             .fl4_tos = tos & IPTOS_RT_MASK,
2493                             .fl4_scope = ((tos & RTO_ONLINK) ?
2494                                           RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2495                             .mark = oldflp->mark,
2496                             .iif = net->loopback_dev->ifindex,
2497                             .oif = oldflp->oif };
2498         struct fib_result res;
2499         unsigned int flags = 0;
2500         struct net_device *dev_out = NULL;
2501         int err;
2502
2503
2504         res.fi          = NULL;
2505 #ifdef CONFIG_IP_MULTIPLE_TABLES
2506         res.r           = NULL;
2507 #endif
2508
2509         if (oldflp->fl4_src) {
2510                 err = -EINVAL;
2511                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2512                     ipv4_is_lbcast(oldflp->fl4_src) ||
2513                     ipv4_is_zeronet(oldflp->fl4_src))
2514                         goto out;
2515
2516                 /* I removed check for oif == dev_out->oif here.
2517                    It was wrong for two reasons:
2518                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2519                       is assigned to multiple interfaces.
2520                    2. Moreover, we are allowed to send packets with saddr
2521                       of another iface. --ANK
2522                  */
2523
2524                 if (oldflp->oif == 0 &&
2525                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2526                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2527                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2528                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2529                         if (dev_out == NULL)
2530                                 goto out;
2531
2532                         /* Special hack: the user can direct multicasts
2533                            and limited broadcast via the necessary interface
2534                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2535                            This hack is not just for fun; it allows
2536                            vic, vat and friends to work.
2537                            They bind a socket to loopback, set the ttl to zero
2538                            and expect that it will work.
2539                            From the viewpoint of the routing cache they are broken,
2540                            because we are not allowed to build a multicast path
2541                            with a loopback source addr (look, the routing cache
2542                            cannot know that the ttl is zero, so the packet
2543                            will not leave this host and the route is valid).
2544                            Luckily, this hack is a good workaround.
2545                          */
2546
2547                         fl.oif = dev_out->ifindex;
2548                         goto make_route;
2549                 }
2550
2551                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2552                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2553                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2554                                 goto out;
2555                 }
2556         }
2557
2558
2559         if (oldflp->oif) {
2560                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2561                 err = -ENODEV;
2562                 if (dev_out == NULL)
2563                         goto out;
2564
2565                 /* RACE: Check return value of inet_select_addr instead. */
2566                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2567                         err = -ENETUNREACH;
2568                         goto out;
2569                 }
2570                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2571                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2572                         if (!fl.fl4_src)
2573                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2574                                                               RT_SCOPE_LINK);
2575                         goto make_route;
2576                 }
2577                 if (!fl.fl4_src) {
2578                         if (ipv4_is_multicast(oldflp->fl4_dst))
2579                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2580                                                               fl.fl4_scope);
2581                         else if (!oldflp->fl4_dst)
2582                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2583                                                               RT_SCOPE_HOST);
2584                 }
2585         }
2586
2587         if (!fl.fl4_dst) {
2588                 fl.fl4_dst = fl.fl4_src;
2589                 if (!fl.fl4_dst)
2590                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2591                 dev_out = net->loopback_dev;
2592                 fl.oif = net->loopback_dev->ifindex;
2593                 res.type = RTN_LOCAL;
2594                 flags |= RTCF_LOCAL;
2595                 goto make_route;
2596         }
2597
2598         if (fib_lookup(net, &fl, &res)) {
2599                 res.fi = NULL;
2600                 if (oldflp->oif) {
2601                         /* Apparently, the routing tables are wrong. Assume
2602                            that the destination is on-link.
2603
2604                            WHY? DW.
2605                            Because we are allowed to send to an iface
2606                            even if it has NO routes and NO assigned
2607                            addresses. When oif is specified, the routing
2608                            tables are looked up with only one purpose:
2609                            to catch whether the destination is gatewayed rather
2610                            than direct. Moreover, if MSG_DONTROUTE is set,
2611                            we send the packet, ignoring both routing tables
2612                            and ifaddr state. --ANK
2613
2614
2615                            We could do this even if oif is unknown
2616                            (as IPv6 likely does), but we do not.
2617                          */
2618
2619                         if (fl.fl4_src == 0)
2620                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2621                                                               RT_SCOPE_LINK);
2622                         res.type = RTN_UNICAST;
2623                         goto make_route;
2624                 }
2625                 err = -ENETUNREACH;
2626                 goto out;
2627         }
2628
2629         if (res.type == RTN_LOCAL) {
2630                 if (!fl.fl4_src) {
2631                         if (res.fi->fib_prefsrc)
2632                                 fl.fl4_src = res.fi->fib_prefsrc;
2633                         else
2634                                 fl.fl4_src = fl.fl4_dst;
2635                 }
2636                 dev_out = net->loopback_dev;
2637                 fl.oif = dev_out->ifindex;
2638                 res.fi = NULL;
2639                 flags |= RTCF_LOCAL;
2640                 goto make_route;
2641         }
2642
2643 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2644         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2645                 fib_select_multipath(&fl, &res);
2646         else
2647 #endif
2648         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2649                 fib_select_default(net, &fl, &res);
2650
2651         if (!fl.fl4_src)
2652                 fl.fl4_src = FIB_RES_PREFSRC(res);
2653
2654         dev_out = FIB_RES_DEV(res);
2655         fl.oif = dev_out->ifindex;
2656
2657
2658 make_route:
2659         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2660
2661 out:    return err;
2662 }
2663
2664 int __ip_route_output_key(struct net *net, struct rtable **rp,
2665                           const struct flowi *flp)
2666 {
2667         unsigned int hash;
2668         int res;
2669         struct rtable *rth;
2670
2671         if (!rt_caching(net))
2672                 goto slow_output;
2673
2674         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2675
2676         rcu_read_lock_bh();
2677         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2678                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2679                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2680                     rth->fl.fl4_src == flp->fl4_src &&
2681                     rt_is_output_route(rth) &&
2682                     rth->fl.oif == flp->oif &&
2683                     rth->fl.mark == flp->mark &&
2684                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2685                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2686                     net_eq(dev_net(rth->dst.dev), net) &&
2687                     !rt_is_expired(rth)) {
2688                         dst_use(&rth->dst, jiffies);
2689                         RT_CACHE_STAT_INC(out_hit);
2690                         rcu_read_unlock_bh();
2691                         *rp = rth;
2692                         return 0;
2693                 }
2694                 RT_CACHE_STAT_INC(out_hlist_search);
2695         }
2696         rcu_read_unlock_bh();
2697
2698 slow_output:
2699         rcu_read_lock();
2700         res = ip_route_output_slow(net, rp, flp);
2701         rcu_read_unlock();
2702         return res;
2703 }
2704 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2705
2706 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2707 {
2708         return NULL;
2709 }
2710
2711 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2712 {
2713         return 0;
2714 }
2715
2716 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2717 {
2718 }
2719
2720 static struct dst_ops ipv4_dst_blackhole_ops = {
2721         .family                 =       AF_INET,
2722         .protocol               =       cpu_to_be16(ETH_P_IP),
2723         .destroy                =       ipv4_dst_destroy,
2724         .check                  =       ipv4_blackhole_dst_check,
2725         .default_mtu            =       ipv4_blackhole_default_mtu,
2726         .default_advmss         =       ipv4_default_advmss,
2727         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2728 };
2729
2730
2731 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2732 {
2733         struct rtable *ort = *rp;
2734         struct rtable *rt = (struct rtable *)
2735                 dst_alloc(&ipv4_dst_blackhole_ops);
2736
2737         if (rt) {
2738                 struct dst_entry *new = &rt->dst;
2739
2740                 atomic_set(&new->__refcnt, 1);
2741                 new->__use = 1;
2742                 new->input = dst_discard;
2743                 new->output = dst_discard;
2744                 dst_copy_metrics(new, &ort->dst);
2745
2746                 new->dev = ort->dst.dev;
2747                 if (new->dev)
2748                         dev_hold(new->dev);
2749
2750                 rt->fl = ort->fl;
2751
2752                 rt->rt_genid = rt_genid(net);
2753                 rt->rt_flags = ort->rt_flags;
2754                 rt->rt_type = ort->rt_type;
2755                 rt->rt_dst = ort->rt_dst;
2756                 rt->rt_src = ort->rt_src;
2757                 rt->rt_iif = ort->rt_iif;
2758                 rt->rt_gateway = ort->rt_gateway;
2759                 rt->rt_spec_dst = ort->rt_spec_dst;
2760                 rt->peer = ort->peer;
2761                 if (rt->peer)
2762                         atomic_inc(&rt->peer->refcnt);
2763
2764                 dst_free(new);
2765         }
2766
2767         dst_release(&(*rp)->dst);
2768         *rp = rt;
2769         return rt ? 0 : -ENOMEM;
2770 }
2771
2772 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2773                          struct sock *sk, int flags)
2774 {
2775         int err;
2776
2777         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2778                 return err;
2779
2780         if (flp->proto) {
2781                 if (!flp->fl4_src)
2782                         flp->fl4_src = (*rp)->rt_src;
2783                 if (!flp->fl4_dst)
2784                         flp->fl4_dst = (*rp)->rt_dst;
2785                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2786                                     flags ? XFRM_LOOKUP_WAIT : 0);
2787                 if (err == -EREMOTE)
2788                         err = ipv4_dst_blackhole(net, rp, flp);
2789
2790                 return err;
2791         }
2792
2793         return 0;
2794 }
2795 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2796
2797 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2798 {
2799         return ip_route_output_flow(net, rp, flp, NULL, 0);
2800 }
2801 EXPORT_SYMBOL(ip_route_output_key);
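
/*
 * Editorial sketch (not part of the original route.c): a minimal,
 * hypothetical caller of ip_route_output_key().  The flowi is filled in
 * the same way as in the lookups above (destination, optional source and
 * oif); on success *rt holds a referenced cache entry that the caller
 * must drop with ip_rt_put() when done.
 */
static inline int example_route_outgoing(struct net *net, __be32 daddr,
					 __be32 saddr, int oif,
					 struct rtable **rt)
{
	struct flowi fl = { .fl4_dst = daddr,
			    .fl4_src = saddr,
			    .oif = oif };

	return ip_route_output_key(net, rt, &fl);
}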
2802
static int rt_fill_info(struct net *net,
                        struct sk_buff *skb, u32 pid, u32 seq, int event,
                        int nowait, unsigned int flags)
{
        struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        long expires;
        u32 id = 0, ts = 0, tsage = 0, error;

        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family    = AF_INET;
        r->rtm_dst_len   = 32;
        r->rtm_src_len   = 0;
        r->rtm_tos       = rt->fl.fl4_tos;
        r->rtm_table     = RT_TABLE_MAIN;
        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
        r->rtm_type      = rt->rt_type;
        r->rtm_scope     = RT_SCOPE_UNIVERSE;
        r->rtm_protocol  = RTPROT_UNSPEC;
        r->rtm_flags     = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;

        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

        if (rt->fl.fl4_src) {
                r->rtm_src_len = 32;
                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
        }
        if (rt->dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
        if (rt->dst.tclassid)
                NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
        if (rt_is_input_route(rt))
                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
        else if (rt->rt_src != rt->fl.fl4_src)
                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

        if (rt->rt_dst != rt->rt_gateway)
                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
                goto nla_put_failure;

        if (rt->fl.mark)
                NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

        error = rt->dst.error;
        expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
        if (rt->peer) {
                inet_peer_refcheck(rt->peer);
                id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
                if (rt->peer->tcp_ts_stamp) {
                        ts = rt->peer->tcp_ts;
                        tsage = get_seconds() - rt->peer->tcp_ts_stamp;
                }
        }

        if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                __be32 dst = rt->rt_dst;

                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb, r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                        error = err;
                                }
                        }
                } else
#endif
                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
                               expires, error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

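/*
 * inet_rtm_getroute() handles RTM_GETROUTE requests.  It builds a dummy
 * skb, resolves the route either through ip_route_input() when an input
 * interface (RTA_IIF) is given or through ip_route_output_key() otherwise,
 * and returns the result to the requester via rt_fill_info() and
 * rtnl_unicast().
 */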
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
        struct net *net = sock_net(in_skb->sk);
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct rtable *rt = NULL;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        int mark;
        struct sk_buff *skb;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers; this skb can pass
         * through a good chunk of the routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
        ip_hdr(skb)->protocol = IPPROTO_ICMP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

        if (iif) {
                struct net_device *dev;

                dev = __dev_get_by_index(net, iif);
                if (dev == NULL) {
                        err = -ENODEV;
                        goto errout_free;
                }

                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                skb->mark       = mark;
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();

                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                struct flowi fl = {
                        .fl4_dst = dst,
                        .fl4_src = src,
                        .fl4_tos = rtm->rtm_tos,
                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
                        .mark = mark,
                };
                err = ip_route_output_key(net, &rt, &fl);
        }

        if (err)
                goto errout_free;

        skb_dst_set(skb, &rt->dst);
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
        return err;

errout_free:
        kfree_skb(skb);
        goto errout;
}

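/*
 * ip_rt_dump() is the netlink dump callback for the route cache: it walks
 * every hash bucket under rcu_read_lock_bh(), skips entries that belong to
 * other namespaces or expired generations, and uses cb->args[0]/args[1] to
 * resume an interrupted dump at the same bucket and index.
 */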
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtable *rt;
        int h, s_h;
        int idx, s_idx;
        struct net *net;

        net = sock_net(skb->sk);

        s_h = cb->args[0];
        if (s_h < 0)
                s_h = 0;
        s_idx = idx = cb->args[1];
        for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
                if (!rt_hash_table[h].chain)
                        continue;
                rcu_read_lock_bh();
                for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
                     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
                        if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
                                continue;
                        if (rt_is_expired(rt))
                                continue;
                        skb_dst_set_noref(skb, &rt->dst);
                        if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
                                         1, NLM_F_MULTI) <= 0) {
                                skb_dst_drop(skb);
                                rcu_read_unlock_bh();
                                goto done;
                        }
                        skb_dst_drop(skb);
                }
                rcu_read_unlock_bh();
        }

done:
        cb->args[0] = h;
        cb->args[1] = idx;
        return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
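/*
 * Writing to the per-namespace "flush" sysctl (registered below under
 * net/ipv4/route) forces a route cache flush; the written value is handed
 * to rt_cache_flush() as the delay.  Illustrative use from user space:
 *
 *      echo 0 > /proc/sys/net/ipv4/route/flush
 */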
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
                                        void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
{
        if (write) {
                int flush_delay;
                ctl_table ctl;
                struct net *net;

                memcpy(&ctl, __ctl, sizeof(ctl));
                ctl.data = &flush_delay;
                proc_dointvec(&ctl, write, buffer, lenp, ppos);

                net = (struct net *)__ctl->extra1;
                rt_cache_flush(net, flush_delay);
                return 0;
        }

        return -EINVAL;
}

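/*
 * The tunables below are exposed under /proc/sys/net/ipv4/route/ (see
 * ipv4_skeleton and ipv4_path further down).  Entries that use the
 * *_jiffies proc handlers are read and written in seconds and converted
 * to jiffies internally.
 */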
static ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */

                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] = {
        { .procname = "route",
          .mode = 0555, .child = ipv4_route_table},
        { .procname = "neigh",
          .mode = 0555, .child = empty},
        { }
};

static __net_initdata struct ctl_path ipv4_path[] = {
        { .procname = "net", },
        { .procname = "ipv4", },
        { },
};

static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
        { .procname = "net", },
        { .procname = "ipv4", },
        { .procname = "route", },
        { },
};

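/*
 * Per-namespace sysctl setup: every net namespace other than init_net gets
 * its own kmemdup'd copy of ipv4_route_flush_table (so extra1 can point at
 * that namespace's struct net), registered under the net/ipv4/route path
 * defined above.
 */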
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (tbl == NULL)
                        goto err_dup;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr =
                register_net_sysctl_table(net, ipv4_route_path, tbl);
        if (net->ipv4.route_hdr == NULL)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

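/*
 * Each namespace starts with a random rt_genid; the route cache treats
 * entries stamped with an older generation as stale (see rt_is_expired()
 * in the dump code above), so bumping the generation invalidates them.
 */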
static __net_init int rt_genid_init(struct net *net)
{
        get_random_bytes(&net->ipv4.rt_genid,
                         sizeof(net->ipv4.rt_genid));
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);
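/*
 * "rhash_entries=N" on the kernel command line overrides the automatic
 * sizing of the route cache hash table done in ip_rt_init() below, e.g.
 * booting with rhash_entries=262144 (value shown here is only an example).
 */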
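/*
 * ip_rt_init() wires the subsystem up at boot: the dst slab cache and dst
 * entry counters, the route cache hash table (whose size also derives
 * gc_thresh and ip_rt_max_size), devinet/fib init, the deferrable expiry
 * worker, /proc files, optional xfrm hooks, the RTM_GETROUTE handler and
 * the pernet sysctl/genid operations.
 */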
int __init ip_rt_init(void)
{
        int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (totalram_pages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        rhash_entries ? 0 : 512 * 1024);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

        /* All the timers started at system startup tend to
         * synchronize. Perturb them a bit.
         */
        INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
        expires_ljiffies = jiffies;
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

        if (ip_rt_proc_init())
                printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init(ip_rt_max_size);
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the IPv4 init order; once that is done,
 * all this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif