net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <linux/reserve.h>
112
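/*
 * RT_FL_TOS() masks the flow's TOS field down to the bits relevant for route
 * lookup: IPTOS_RT_MASK plus the internal RTO_ONLINK flag, which is carried
 * in the same byte to request a link-scope ("on-link") lookup.
 */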
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
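/*
 * The tunables below control route cache garbage collection, ICMP
 * redirect/error rate limiting and PMTU clamping.  Most of them are exposed
 * read/write via sysctl under /proc/sys/net/ipv4/route/.
 */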
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
134
135 static struct delayed_work expires_work;
136 static unsigned long expires_ljiffies;
137
138 /*
139  *      Interface to generic destination cache.
140  */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static void              ipv4_dst_destroy(struct dst_entry *dst);
144 static void              ipv4_dst_ifdown(struct dst_entry *dst,
145                                          struct net_device *dev, int how);
146 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
147 static void              ipv4_link_failure(struct sk_buff *skb);
148 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
149 static int rt_garbage_collect(struct dst_ops *ops);
150
151
152 static struct dst_ops ipv4_dst_ops = {
153         .family =               AF_INET,
154         .protocol =             cpu_to_be16(ETH_P_IP),
155         .gc =                   rt_garbage_collect,
156         .check =                ipv4_dst_check,
157         .destroy =              ipv4_dst_destroy,
158         .ifdown =               ipv4_dst_ifdown,
159         .negative_advice =      ipv4_negative_advice,
160         .link_failure =         ipv4_link_failure,
161         .update_pmtu =          ip_rt_update_pmtu,
162         .local_out =            __ip_local_out,
163         .entries =              ATOMIC_INIT(0),
164 };
165
166 #define ECN_OR_COST(class)      TC_PRIO_##class
167
168 const __u8 ip_tos2prio[16] = {
169         TC_PRIO_BESTEFFORT,
170         ECN_OR_COST(FILLER),
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(BESTEFFORT),
173         TC_PRIO_BULK,
174         ECN_OR_COST(BULK),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_INTERACTIVE,
178         ECN_OR_COST(INTERACTIVE),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE_BULK,
182         ECN_OR_COST(INTERACTIVE_BULK),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK)
185 };
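/*
 * ip_tos2prio[] is indexed with the TOS "type of service" bits shifted right
 * by one, i.e. rt_tos2priority(tos) == ip_tos2prio[IPTOS_TOS(tos) >> 1]
 * (see <net/route.h>).  For example, a packet sent with TOS 0x10 (low delay)
 * lands on index 8 and is queued as TC_PRIO_INTERACTIVE.
 */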
186
187
188 /*
189  * Route cache.
190  */
191
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
201
202 struct rt_hash_bucket {
203         struct rtable   *chain;
204 };
205
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207         defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (with lockdep we have a quite big spinlock_t, so keep the size down there)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ        256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ       4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ       2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ       1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ       512
224 # else
225 #  define RT_HASH_LOCK_SZ       256
226 # endif
227 #endif
229
230 static spinlock_t       *rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
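/*
 * The hash table usually has far more buckets than RT_HASH_LOCK_SZ locks, so
 * rt_hash_lock_addr() folds the bucket index into the lock table: all buckets
 * whose index is congruent modulo RT_HASH_LOCK_SZ share one spinlock
 * (classic lock striping).
 */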
232
233 static __init void rt_hash_lock_init(void)
234 {
235         int i;
236
237         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238                         GFP_KERNEL);
239         if (!rt_hash_locks)
240                 panic("IP: failed to allocate rt_hash_locks\n");
241
242         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243                 spin_lock_init(&rt_hash_locks[i]);
244 }
245 #else
246 # define rt_hash_lock_addr(slot) NULL
247
248 static inline void rt_hash_lock_init(void)
249 {
250 }
251 #endif
252
253 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
254 static unsigned                 rt_hash_mask __read_mostly;
255 static unsigned int             rt_hash_log  __read_mostly;
256
257 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
258 #define RT_CACHE_STAT_INC(field) \
259         (__raw_get_cpu_var(rt_cache_stat).field++)
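/*
 * RT_CACHE_STAT_INC() bumps a per-CPU counter with a plain, non-atomic
 * increment; the counters are purely statistical, so a rare lost update is
 * acceptable.  They are exported per CPU via /proc/net/stat/rt_cache
 * (see rt_cpu_seq_show() below).
 */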
260
261 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
262                                    int genid)
263 {
264         return jhash_3words((__force u32)daddr, (__force u32)saddr,
265                             idx, genid)
266                 & rt_hash_mask;
267 }
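/*
 * The per-namespace generation id (rt_genid) is mixed into the hash, so a
 * cache flush only needs to bump rt_genid: new entries immediately hash to
 * different buckets, while stale ones are reaped lazily via rt_is_expired().
 */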
268
269 static inline int rt_genid(struct net *net)
270 {
271         return atomic_read(&net->ipv4.rt_genid);
272 }
273
274 static struct mem_reserve ipv4_route_reserve;
275
278 #ifdef CONFIG_PROC_FS
279 struct rt_cache_iter_state {
280         struct seq_net_private p;
281         int bucket;
282         int genid;
283 };
284
285 static struct rtable *rt_cache_get_first(struct seq_file *seq)
286 {
287         struct rt_cache_iter_state *st = seq->private;
288         struct rtable *r = NULL;
289
290         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
291                 if (!rt_hash_table[st->bucket].chain)
292                         continue;
293                 rcu_read_lock_bh();
294                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
295                 while (r) {
296                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
297                             r->rt_genid == st->genid)
298                                 return r;
299                         r = rcu_dereference_bh(r->u.dst.rt_next);
300                 }
301                 rcu_read_unlock_bh();
302         }
303         return r;
304 }
305
306 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
307                                           struct rtable *r)
308 {
309         struct rt_cache_iter_state *st = seq->private;
310
311         r = r->u.dst.rt_next;
312         while (!r) {
313                 rcu_read_unlock_bh();
314                 do {
315                         if (--st->bucket < 0)
316                                 return NULL;
317                 } while (!rt_hash_table[st->bucket].chain);
318                 rcu_read_lock_bh();
319                 r = rt_hash_table[st->bucket].chain;
320         }
321         return rcu_dereference_bh(r);
322 }
323
324 static struct rtable *rt_cache_get_next(struct seq_file *seq,
325                                         struct rtable *r)
326 {
327         struct rt_cache_iter_state *st = seq->private;
328         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
329                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
330                         continue;
331                 if (r->rt_genid == st->genid)
332                         break;
333         }
334         return r;
335 }
336
337 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
338 {
339         struct rtable *r = rt_cache_get_first(seq);
340
341         if (r)
342                 while (pos && (r = rt_cache_get_next(seq, r)))
343                         --pos;
344         return pos ? NULL : r;
345 }
346
347 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
348 {
349         struct rt_cache_iter_state *st = seq->private;
350         if (*pos)
351                 return rt_cache_get_idx(seq, *pos - 1);
352         st->genid = rt_genid(seq_file_net(seq));
353         return SEQ_START_TOKEN;
354 }
355
356 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
357 {
358         struct rtable *r;
359
360         if (v == SEQ_START_TOKEN)
361                 r = rt_cache_get_first(seq);
362         else
363                 r = rt_cache_get_next(seq, v);
364         ++*pos;
365         return r;
366 }
367
368 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
369 {
370         if (v && v != SEQ_START_TOKEN)
371                 rcu_read_unlock_bh();
372 }
373
374 static int rt_cache_seq_show(struct seq_file *seq, void *v)
375 {
376         if (v == SEQ_START_TOKEN)
377                 seq_printf(seq, "%-127s\n",
378                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
379                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
380                            "HHUptod\tSpecDst");
381         else {
382                 struct rtable *r = v;
383                 int len;
384
385                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
386                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
387                         r->u.dst.dev ? r->u.dst.dev->name : "*",
388                         (__force u32)r->rt_dst,
389                         (__force u32)r->rt_gateway,
390                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
391                         r->u.dst.__use, 0, (__force u32)r->rt_src,
392                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
393                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
394                         dst_metric(&r->u.dst, RTAX_WINDOW),
395                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
396                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
397                         r->fl.fl4_tos,
398                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
399                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
400                                        dev_queue_xmit) : 0,
401                         r->rt_spec_dst, &len);
402
403                 seq_printf(seq, "%*s\n", 127 - len, "");
404         }
405         return 0;
406 }
407
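/*
 * The mutex and proc_dointvec_route() handler below are not part of the
 * mainline route.c; they appear to come from the memory-reserve ("swap over
 * network") patch set carried by this tree.  When ip_rt_max_size is changed
 * through its sysctl, the handler first resizes the emergency kmem_cache
 * reserve for route entries and only commits the new limit if that
 * reservation succeeds.
 */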
408 static struct mutex ipv4_route_lock;
409
410 static int
411 proc_dointvec_route(struct ctl_table *table, int write, void __user *buffer,
412                 size_t *lenp, loff_t *ppos)
413 {
414         ctl_table tmp = *table;
415         int new_size, ret;
416
417         mutex_lock(&ipv4_route_lock);
418         if (write) {
419                 tmp.data = &new_size;
420                 table = &tmp;
421         }
422
423         ret = proc_dointvec(table, write, buffer, lenp, ppos);
424
425         if (!ret && write) {
426                 ret = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
427                                 ipv4_dst_ops.kmem_cachep, new_size);
428                 if (!ret)
429                         ip_rt_max_size = new_size;
430         }
431         mutex_unlock(&ipv4_route_lock);
432
433         return ret;
434 }
435
438 static const struct seq_operations rt_cache_seq_ops = {
439         .start  = rt_cache_seq_start,
440         .next   = rt_cache_seq_next,
441         .stop   = rt_cache_seq_stop,
442         .show   = rt_cache_seq_show,
443 };
444
445 static int rt_cache_seq_open(struct inode *inode, struct file *file)
446 {
447         return seq_open_net(inode, file, &rt_cache_seq_ops,
448                         sizeof(struct rt_cache_iter_state));
449 }
450
451 static const struct file_operations rt_cache_seq_fops = {
452         .owner   = THIS_MODULE,
453         .open    = rt_cache_seq_open,
454         .read    = seq_read,
455         .llseek  = seq_lseek,
456         .release = seq_release_net,
457 };
458
459
460 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
461 {
462         int cpu;
463
464         if (*pos == 0)
465                 return SEQ_START_TOKEN;
466
467         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
468                 if (!cpu_possible(cpu))
469                         continue;
470                 *pos = cpu+1;
471                 return &per_cpu(rt_cache_stat, cpu);
472         }
473         return NULL;
474 }
475
476 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
477 {
478         int cpu;
479
480         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
481                 if (!cpu_possible(cpu))
482                         continue;
483                 *pos = cpu+1;
484                 return &per_cpu(rt_cache_stat, cpu);
485         }
486         return NULL;
487
488 }
489
490 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
491 {
492
493 }
494
495 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
496 {
497         struct rt_cache_stat *st = v;
498
499         if (v == SEQ_START_TOKEN) {
500                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
501                 return 0;
502         }
503
504         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
505                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
506                    atomic_read(&ipv4_dst_ops.entries),
507                    st->in_hit,
508                    st->in_slow_tot,
509                    st->in_slow_mc,
510                    st->in_no_route,
511                    st->in_brd,
512                    st->in_martian_dst,
513                    st->in_martian_src,
514
515                    st->out_hit,
516                    st->out_slow_tot,
517                    st->out_slow_mc,
518
519                    st->gc_total,
520                    st->gc_ignored,
521                    st->gc_goal_miss,
522                    st->gc_dst_overflow,
523                    st->in_hlist_search,
524                    st->out_hlist_search
525                 );
526         return 0;
527 }
528
529 static const struct seq_operations rt_cpu_seq_ops = {
530         .start  = rt_cpu_seq_start,
531         .next   = rt_cpu_seq_next,
532         .stop   = rt_cpu_seq_stop,
533         .show   = rt_cpu_seq_show,
534 };
535
536
537 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
538 {
539         return seq_open(file, &rt_cpu_seq_ops);
540 }
541
542 static const struct file_operations rt_cpu_seq_fops = {
543         .owner   = THIS_MODULE,
544         .open    = rt_cpu_seq_open,
545         .read    = seq_read,
546         .llseek  = seq_lseek,
547         .release = seq_release,
548 };
549
550 #ifdef CONFIG_NET_CLS_ROUTE
551 static int rt_acct_proc_show(struct seq_file *m, void *v)
552 {
553         struct ip_rt_acct *dst, *src;
554         unsigned int i, j;
555
556         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
557         if (!dst)
558                 return -ENOMEM;
559
560         for_each_possible_cpu(i) {
561                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
562                 for (j = 0; j < 256; j++) {
563                         dst[j].o_bytes   += src[j].o_bytes;
564                         dst[j].o_packets += src[j].o_packets;
565                         dst[j].i_bytes   += src[j].i_bytes;
566                         dst[j].i_packets += src[j].i_packets;
567                 }
568         }
569
570         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
571         kfree(dst);
572         return 0;
573 }
574
575 static int rt_acct_proc_open(struct inode *inode, struct file *file)
576 {
577         return single_open(file, rt_acct_proc_show, NULL);
578 }
579
580 static const struct file_operations rt_acct_proc_fops = {
581         .owner          = THIS_MODULE,
582         .open           = rt_acct_proc_open,
583         .read           = seq_read,
584         .llseek         = seq_lseek,
585         .release        = single_release,
586 };
587 #endif
588
589 static int __net_init ip_rt_do_proc_init(struct net *net)
590 {
591         struct proc_dir_entry *pde;
592
593         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
594                         &rt_cache_seq_fops);
595         if (!pde)
596                 goto err1;
597
598         pde = proc_create("rt_cache", S_IRUGO,
599                           net->proc_net_stat, &rt_cpu_seq_fops);
600         if (!pde)
601                 goto err2;
602
603 #ifdef CONFIG_NET_CLS_ROUTE
604         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
605         if (!pde)
606                 goto err3;
607 #endif
608         return 0;
609
610 #ifdef CONFIG_NET_CLS_ROUTE
611 err3:
612         remove_proc_entry("rt_cache", net->proc_net_stat);
613 #endif
614 err2:
615         remove_proc_entry("rt_cache", net->proc_net);
616 err1:
617         return -ENOMEM;
618 }
619
620 static void __net_exit ip_rt_do_proc_exit(struct net *net)
621 {
622         remove_proc_entry("rt_cache", net->proc_net_stat);
623         remove_proc_entry("rt_cache", net->proc_net);
624 #ifdef CONFIG_NET_CLS_ROUTE
625         remove_proc_entry("rt_acct", net->proc_net);
626 #endif
627 }
628
629 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
630         .init = ip_rt_do_proc_init,
631         .exit = ip_rt_do_proc_exit,
632 };
633
634 static int __init ip_rt_proc_init(void)
635 {
636         return register_pernet_subsys(&ip_rt_proc_ops);
637 }
638
639 #else
640 static inline int ip_rt_proc_init(void)
641 {
642         return 0;
643 }
644 #endif /* CONFIG_PROC_FS */
645
646 static inline void rt_free(struct rtable *rt)
647 {
648         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
649 }
650
651 static inline void rt_drop(struct rtable *rt)
652 {
653         ip_rt_put(rt);
654         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
655 }
656
657 static inline int rt_fast_clean(struct rtable *rth)
658 {
659         /* Kill broadcast/multicast entries very aggressively, if they
660            collide in the hash table with more useful entries */
661         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
662                 rth->fl.iif && rth->u.dst.rt_next;
663 }
664
665 static inline int rt_valuable(struct rtable *rth)
666 {
667         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
668                 rth->u.dst.expires;
669 }
670
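/*
 * rt_may_expire() decides whether an unreferenced cache entry may be
 * reclaimed: always once its hard expiry (dst.expires) has passed, and
 * otherwise only when it is older than tmo1 (older than tmo2 for
 * "valuable" entries such as redirects).  Broadcast/multicast input
 * entries that share a chain (rt_fast_clean) get no tmo1 grace period.
 */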
671 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
672 {
673         unsigned long age;
674         int ret = 0;
675
676         if (atomic_read(&rth->u.dst.__refcnt))
677                 goto out;
678
679         ret = 1;
680         if (rth->u.dst.expires &&
681             time_after_eq(jiffies, rth->u.dst.expires))
682                 goto out;
683
684         age = jiffies - rth->u.dst.lastuse;
685         ret = 0;
686         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
687             (age <= tmo2 && rt_valuable(rth)))
688                 goto out;
689         ret = 1;
690 out:    return ret;
691 }
692
693 /* Bits of score are:
694  * 31: very valuable
695  * 30: not quite useless
696  * 29..0: usage counter
697  */
698 static inline u32 rt_score(struct rtable *rt)
699 {
700         u32 score = jiffies - rt->u.dst.lastuse;
701
702         score = ~score & ~(3<<30);
703
704         if (rt_valuable(rt))
705                 score |= (1<<31);
706
707         if (!rt->fl.iif ||
708             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
709                 score |= (1<<30);
710
711         return score;
712 }
713
714 static inline bool rt_caching(const struct net *net)
715 {
716         return net->ipv4.current_rt_cache_rebuild_count <=
717                 net->ipv4.sysctl_rt_cache_rebuild_count;
718 }
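/*
 * Route caching stays enabled for a namespace until the number of emergency
 * hash rebuilds exceeds the per-namespace limit (presumably exposed as the
 * net.ipv4.rt_cache_rebuild_count sysctl); once that happens, rt_intern_hash()
 * stops inserting new entries into the cache.
 */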
719
720 static inline bool compare_hash_inputs(const struct flowi *fl1,
721                                         const struct flowi *fl2)
722 {
723         return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
724                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
725                 (fl1->iif ^ fl2->iif)) == 0);
726 }
727
728 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
729 {
730         return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
731                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
732                 (fl1->mark ^ fl2->mark) |
733                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
734                 (fl1->oif ^ fl2->oif) |
735                 (fl1->iif ^ fl2->iif)) == 0;
736 }
737
738 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
739 {
740         return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
741 }
742
743 static inline int rt_is_expired(struct rtable *rth)
744 {
745         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
746 }
747
748 /*
749  * Perform a full scan of the hash table and free all entries.
750  * Can be called by a softirq or a process.
751  * In the latter case, we want to reschedule if necessary.
752  */
753 static void rt_do_flush(int process_context)
754 {
755         unsigned int i;
756         struct rtable *rth, *next;
757         struct rtable * tail;
758
759         for (i = 0; i <= rt_hash_mask; i++) {
760                 if (process_context && need_resched())
761                         cond_resched();
762                 rth = rt_hash_table[i].chain;
763                 if (!rth)
764                         continue;
765
766                 spin_lock_bh(rt_hash_lock_addr(i));
767 #ifdef CONFIG_NET_NS
768                 {
769                 struct rtable ** prev, * p;
770
771                 rth = rt_hash_table[i].chain;
772
773                 /* defer releasing the head of the list until after spin_unlock */
774                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
775                         if (!rt_is_expired(tail))
776                                 break;
777                 if (rth != tail)
778                         rt_hash_table[i].chain = tail;
779
780                 /* call rt_free on entries after the tail requiring flush */
781                 prev = &rt_hash_table[i].chain;
782                 for (p = *prev; p; p = next) {
783                         next = p->u.dst.rt_next;
784                         if (!rt_is_expired(p)) {
785                                 prev = &p->u.dst.rt_next;
786                         } else {
787                                 *prev = next;
788                                 rt_free(p);
789                         }
790                 }
791                 }
792 #else
793                 rth = rt_hash_table[i].chain;
794                 rt_hash_table[i].chain = NULL;
795                 tail = NULL;
796 #endif
797                 spin_unlock_bh(rt_hash_lock_addr(i));
798
799                 for (; rth != tail; rth = next) {
800                         next = rth->u.dst.rt_next;
801                         rt_free(rth);
802                 }
803         }
804 }
805
806 /*
807  * While freeing expired entries, we compute average chain length
808  * and standard deviation, using fixed-point arithmetic.
809  * This gives an estimate for rt_chain_length_max:
810  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
811  * We use 3 bits for the fractional part, and 29 (or 61) bits for the magnitude.
812  */
813
814 #define FRACT_BITS 3
815 #define ONE (1UL << FRACT_BITS)
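/*
 * Chain lengths are accumulated in units of ONE (1 << FRACT_BITS), i.e. as a
 * fixed-point value with 3 fractional bits.  For example, a bucket holding
 * three entries with distinct hash inputs accumulates 3 * ONE = 24, which
 * reads back as 3 after the final ">> FRACT_BITS".
 */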
816
817 /*
818  * Given a hash chain and an item in this hash chain,
819  * find if a previous entry has the same hash_inputs
820  * (but differs on tos, mark or oif)
821  * Returns 0 if an alias is found.
822  * Returns ONE if rth has no alias before itself.
823  */
824 static int has_noalias(const struct rtable *head, const struct rtable *rth)
825 {
826         const struct rtable *aux = head;
827
828         while (aux != rth) {
829                 if (compare_hash_inputs(&aux->fl, &rth->fl))
830                         return 0;
831                 aux = aux->u.dst.rt_next;
832         }
833         return ONE;
834 }
835
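/*
 * rt_check_expire() runs periodically from rt_worker_func().  It scans just
 * enough buckets per run to cover the whole table once every
 * ip_rt_gc_timeout jiffies: goal = (elapsed << rt_hash_log) / ip_rt_gc_timeout.
 * As a rough illustration (assuming a 65536-bucket table, the default 300 s
 * gc timeout and the 60 s work interval), each run visits about
 * 65536 * 60 / 300 = 13107 buckets.
 */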
836 static void rt_check_expire(void)
837 {
838         static unsigned int rover;
839         unsigned int i = rover, goal;
840         struct rtable *rth, **rthp;
841         unsigned long samples = 0;
842         unsigned long sum = 0, sum2 = 0;
843         unsigned long delta;
844         u64 mult;
845
846         delta = jiffies - expires_ljiffies;
847         expires_ljiffies = jiffies;
848         mult = ((u64)delta) << rt_hash_log;
849         if (ip_rt_gc_timeout > 1)
850                 do_div(mult, ip_rt_gc_timeout);
851         goal = (unsigned int)mult;
852         if (goal > rt_hash_mask)
853                 goal = rt_hash_mask + 1;
854         for (; goal > 0; goal--) {
855                 unsigned long tmo = ip_rt_gc_timeout;
856                 unsigned long length;
857
858                 i = (i + 1) & rt_hash_mask;
859                 rthp = &rt_hash_table[i].chain;
860
861                 if (need_resched())
862                         cond_resched();
863
864                 samples++;
865
866                 if (*rthp == NULL)
867                         continue;
868                 length = 0;
869                 spin_lock_bh(rt_hash_lock_addr(i));
870                 while ((rth = *rthp) != NULL) {
871                         prefetch(rth->u.dst.rt_next);
872                         if (rt_is_expired(rth)) {
873                                 *rthp = rth->u.dst.rt_next;
874                                 rt_free(rth);
875                                 continue;
876                         }
877                         if (rth->u.dst.expires) {
878                                 /* Entry is expired even if it is in use */
879                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
880 nofree:
881                                         tmo >>= 1;
882                                         rthp = &rth->u.dst.rt_next;
883                                         /*
884                                          * We only count entries on
885                                          * a chain with equal hash inputs once
886                                          * so that entries for different QoS
887                                          * levels and other non-hash-input
888                                          * attributes don't unfairly skew
889                                          * the length computation.
890                                          */
891                                         length += has_noalias(rt_hash_table[i].chain, rth);
892                                         continue;
893                                 }
894                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
895                                 goto nofree;
896
897                         /* Cleanup aged off entries. */
898                         *rthp = rth->u.dst.rt_next;
899                         rt_free(rth);
900                 }
901                 spin_unlock_bh(rt_hash_lock_addr(i));
902                 sum += length;
903                 sum2 += length*length;
904         }
905         if (samples) {
906                 unsigned long avg = sum / samples;
907                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
908                 rt_chain_length_max = max_t(unsigned long,
909                                         ip_rt_gc_elasticity,
910                                         (avg + 4*sd) >> FRACT_BITS);
911         }
912         rover = i;
913 }
914
915 /*
916  * rt_worker_func() is run in process context.
917  * We call rt_check_expire() to scan part of the hash table.
918  */
919 static void rt_worker_func(struct work_struct *work)
920 {
921         rt_check_expire();
922         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
923 }
924
925 /*
926  * Perturbation of rt_genid by a small quantity [1..256].
927  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
928  * many times (2^24) without reusing a recent rt_genid.
929  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
930  */
931 static void rt_cache_invalidate(struct net *net)
932 {
933         unsigned char shuffle;
934
935         get_random_bytes(&shuffle, sizeof(shuffle));
936         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
937 }
938
939 /*
940  * delay < 0  : invalidate cache (fast : entries will be deleted later)
941  * delay >= 0 : invalidate & flush cache (can be long)
942  */
943 void rt_cache_flush(struct net *net, int delay)
944 {
945         rt_cache_invalidate(net);
946         if (delay >= 0)
947                 rt_do_flush(!in_softirq());
948 }
949
950 /* Flush previously invalidated entries from the cache */
951 void rt_cache_flush_batch(void)
952 {
953         rt_do_flush(!in_softirq());
954 }
955
956 static void rt_emergency_hash_rebuild(struct net *net)
957 {
958         if (net_ratelimit())
959                 printk(KERN_WARNING "Route hash chain too long!\n");
960         rt_cache_invalidate(net);
961 }
962
963 /*
964    Short description of GC goals.
965
966    We want to build an algorithm which keeps the routing cache
967    at some equilibrium point, where the number of aged-off entries
968    is kept approximately equal to the number of newly generated ones.
969
970    The current expiration strength is the variable "expire".
971    We try to adjust it dynamically, so that when networking
972    is idle, expire is large enough to keep plenty of warm entries,
973    and when load increases, it shrinks to limit the cache size.
974  */
975
976 static int rt_garbage_collect(struct dst_ops *ops)
977 {
978         static unsigned long expire = RT_GC_TIMEOUT;
979         static unsigned long last_gc;
980         static int rover;
981         static int equilibrium;
982         struct rtable *rth, **rthp;
983         unsigned long now = jiffies;
984         int goal;
985
986         /*
987          * Garbage collection is pretty expensive,
988          * do not make it too frequently.
989          */
990
991         RT_CACHE_STAT_INC(gc_total);
992
993         if (now - last_gc < ip_rt_gc_min_interval &&
994             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
995                 RT_CACHE_STAT_INC(gc_ignored);
996                 goto out;
997         }
998
999         /* Calculate the number of entries which we want to expire now. */
1000         goal = atomic_read(&ipv4_dst_ops.entries) -
1001                 (ip_rt_gc_elasticity << rt_hash_log);
1002         if (goal <= 0) {
1003                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1004                         equilibrium = ipv4_dst_ops.gc_thresh;
1005                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
1006                 if (goal > 0) {
1007                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1008                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
1009                 }
1010         } else {
1011                 /* We are in a dangerous area. Try to reduce the cache really
1012                  * aggressively.
1013                  */
1014                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1016         }
1017
1018         if (now - last_gc >= ip_rt_gc_min_interval)
1019                 last_gc = now;
1020
1021         if (goal <= 0) {
1022                 equilibrium += goal;
1023                 goto work_done;
1024         }
1025
1026         do {
1027                 int i, k;
1028
1029                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1030                         unsigned long tmo = expire;
1031
1032                         k = (k + 1) & rt_hash_mask;
1033                         rthp = &rt_hash_table[k].chain;
1034                         spin_lock_bh(rt_hash_lock_addr(k));
1035                         while ((rth = *rthp) != NULL) {
1036                                 if (!rt_is_expired(rth) &&
1037                                         !rt_may_expire(rth, tmo, expire)) {
1038                                         tmo >>= 1;
1039                                         rthp = &rth->u.dst.rt_next;
1040                                         continue;
1041                                 }
1042                                 *rthp = rth->u.dst.rt_next;
1043                                 rt_free(rth);
1044                                 goal--;
1045                         }
1046                         spin_unlock_bh(rt_hash_lock_addr(k));
1047                         if (goal <= 0)
1048                                 break;
1049                 }
1050                 rover = k;
1051
1052                 if (goal <= 0)
1053                         goto work_done;
1054
1055                 /* Goal is not achieved. We stop the process if:
1056
1057                    - expire has been reduced to zero; otherwise, expire is halved.
1058                    - the table is not full.
1059                    - we are called from interrupt context.
1060                    - the jiffies check is just a fallback/debug loop breaker.
1061                      We will not spin here for a long time in any case.
1062                  */
1063
1064                 RT_CACHE_STAT_INC(gc_goal_miss);
1065
1066                 if (expire == 0)
1067                         break;
1068
1069                 expire >>= 1;
1070 #if RT_CACHE_DEBUG >= 2
1071                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1072                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1073 #endif
1074
1075                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1076                         goto out;
1077         } while (!in_softirq() && time_before_eq(jiffies, now));
1078
1079         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1080                 goto out;
1081         if (net_ratelimit())
1082                 printk(KERN_WARNING "dst cache overflow\n");
1083         RT_CACHE_STAT_INC(gc_dst_overflow);
1084         return 1;
1085
1086 work_done:
1087         expire += ip_rt_gc_min_interval;
1088         if (expire > ip_rt_gc_timeout ||
1089             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1090                 expire = ip_rt_gc_timeout;
1091 #if RT_CACHE_DEBUG >= 2
1092         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1093                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1094 #endif
1095 out:    return 0;
1096 }
1097
1098 /*
1099  * Returns the number of entries in a hash chain that have distinct hash_inputs
1100  */
1101 static int slow_chain_length(const struct rtable *head)
1102 {
1103         int length = 0;
1104         const struct rtable *rth = head;
1105
1106         while (rth) {
1107                 length += has_noalias(head, rth);
1108                 rth = rth->u.dst.rt_next;
1109         }
1110         return length >> FRACT_BITS;
1111 }
1112
1113 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1114                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1115 {
1116         struct rtable   *rth, **rthp;
1117         unsigned long   now;
1118         struct rtable *cand, **candp;
1119         u32             min_score;
1120         int             chain_length;
1121         int attempts = !in_softirq();
1122
1123 restart:
1124         chain_length = 0;
1125         min_score = ~(u32)0;
1126         cand = NULL;
1127         candp = NULL;
1128         now = jiffies;
1129
1130         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1131                 /*
1132                  * If we're not caching, just tell the caller we
1133                  * were successful and don't touch the route.  The
1134                  * caller holds the sole reference to the cache entry, and
1135                  * it will be released when the caller is done with it.
1136                  * If we drop it here, the callers have no way to resolve routes
1137                  * when we're not caching.  Instead, just point *rp at rt, so
1138                  * the caller gets a single use out of the route.
1139                  * Note that we do rt_free on this new route entry, so that
1140                  * once its refcount hits zero, we are still able to reap it
1141                  * (thanks, Alexey).
1142                  * Note also that rt_free uses call_rcu.  We don't actually
1143                  * need rcu protection here, this is just our path to get
1144                  * on the route gc list.
1145                  */
1146
1147                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1148                         int err = arp_bind_neighbour(&rt->u.dst);
1149                         if (err) {
1150                                 if (net_ratelimit())
1151                                         printk(KERN_WARNING
1152                                             "Neighbour table failure & not caching routes.\n");
1153                                 rt_drop(rt);
1154                                 return err;
1155                         }
1156                 }
1157
1158                 rt_free(rt);
1159                 goto skip_hashing;
1160         }
1161
1162         rthp = &rt_hash_table[hash].chain;
1163
1164         spin_lock_bh(rt_hash_lock_addr(hash));
1165         while ((rth = *rthp) != NULL) {
1166                 if (rt_is_expired(rth)) {
1167                         *rthp = rth->u.dst.rt_next;
1168                         rt_free(rth);
1169                         continue;
1170                 }
1171                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1172                         /* Put it first */
1173                         *rthp = rth->u.dst.rt_next;
1174                         /*
1175                          * Since lookup is lockfree, the deletion
1176                          * must be visible to another weakly ordered CPU before
1177                          * the insertion at the start of the hash chain.
1178                          */
1179                         rcu_assign_pointer(rth->u.dst.rt_next,
1180                                            rt_hash_table[hash].chain);
1181                         /*
1182                          * Since lookup is lockfree, the update writes
1183                          * must be ordered for consistency on SMP.
1184                          */
1185                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1186
1187                         dst_use(&rth->u.dst, now);
1188                         spin_unlock_bh(rt_hash_lock_addr(hash));
1189
1190                         rt_drop(rt);
1191                         if (rp)
1192                                 *rp = rth;
1193                         else
1194                                 skb_dst_set(skb, &rth->u.dst);
1195                         return 0;
1196                 }
1197
1198                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1199                         u32 score = rt_score(rth);
1200
1201                         if (score <= min_score) {
1202                                 cand = rth;
1203                                 candp = rthp;
1204                                 min_score = score;
1205                         }
1206                 }
1207
1208                 chain_length++;
1209
1210                 rthp = &rth->u.dst.rt_next;
1211         }
1212
1213         if (cand) {
1214                 /* ip_rt_gc_elasticity used to be the average chain
1215                  * length; when exceeded, gc becomes really aggressive.
1216                  *
1217                  * The second limit is less certain. At the moment it allows
1218                  * only 2 entries per bucket. We will see.
1219                  */
1220                 if (chain_length > ip_rt_gc_elasticity) {
1221                         *candp = cand->u.dst.rt_next;
1222                         rt_free(cand);
1223                 }
1224         } else {
1225                 if (chain_length > rt_chain_length_max &&
1226                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1227                         struct net *net = dev_net(rt->u.dst.dev);
1228                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1229                         if (!rt_caching(net)) {
1230                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1231                                         rt->u.dst.dev->name, num);
1232                         }
1233                         rt_emergency_hash_rebuild(net);
1234                         spin_unlock_bh(rt_hash_lock_addr(hash));
1235
1236                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1237                                         ifindex, rt_genid(net));
1238                         goto restart;
1239                 }
1240         }
1241
1242         /* Try to bind the route to an ARP entry only if it is an output
1243            route or on the unicast forwarding path.
1244          */
1245         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1246                 int err = arp_bind_neighbour(&rt->u.dst);
1247                 if (err) {
1248                         spin_unlock_bh(rt_hash_lock_addr(hash));
1249
1250                         if (err != -ENOBUFS) {
1251                                 rt_drop(rt);
1252                                 return err;
1253                         }
1254
1255                         /* Neighbour tables are full and nothing
1256                            can be released. Try to shrink the route cache;
1257                            it most likely holds some neighbour records.
1258                          */
1259                         if (attempts-- > 0) {
1260                                 int saved_elasticity = ip_rt_gc_elasticity;
1261                                 int saved_int = ip_rt_gc_min_interval;
1262                                 ip_rt_gc_elasticity     = 1;
1263                                 ip_rt_gc_min_interval   = 0;
1264                                 rt_garbage_collect(&ipv4_dst_ops);
1265                                 ip_rt_gc_min_interval   = saved_int;
1266                                 ip_rt_gc_elasticity     = saved_elasticity;
1267                                 goto restart;
1268                         }
1269
1270                         if (net_ratelimit())
1271                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1272                         rt_drop(rt);
1273                         return -ENOBUFS;
1274                 }
1275         }
1276
1277         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1278
1279 #if RT_CACHE_DEBUG >= 2
1280         if (rt->u.dst.rt_next) {
1281                 struct rtable *trt;
1282                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1283                        hash, &rt->rt_dst);
1284                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1285                         printk(" . %pI4", &trt->rt_dst);
1286                 printk("\n");
1287         }
1288 #endif
1289         /*
1290          * Since lookup is lockfree, we must make sure
1291          * previous writes to rt are committed to memory
1292          * before making rt visible to other CPUs.
1293          */
1294         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1295
1296         spin_unlock_bh(rt_hash_lock_addr(hash));
1297
1298 skip_hashing:
1299         if (rp)
1300                 *rp = rt;
1301         else
1302                 skb_dst_set(skb, &rt->u.dst);
1303         return 0;
1304 }
1305
1306 void rt_bind_peer(struct rtable *rt, int create)
1307 {
1308         static DEFINE_SPINLOCK(rt_peer_lock);
1309         struct inet_peer *peer;
1310
1311         peer = inet_getpeer(rt->rt_dst, create);
1312
1313         spin_lock_bh(&rt_peer_lock);
1314         if (rt->peer == NULL) {
1315                 rt->peer = peer;
1316                 peer = NULL;
1317         }
1318         spin_unlock_bh(&rt_peer_lock);
1319         if (peer)
1320                 inet_putpeer(peer);
1321 }
1322
1323 /*
1324  * Peer allocation may fail only in serious out-of-memory conditions.  However
1325  * we can still generate some output.
1326  * Random ID selection looks a bit dangerous because we have no chance of
1327  * selecting an ID that is unique within a reasonable period of time.
1328  * But a broken packet identifier may be better than no packet at all.
1329  */
1330 static void ip_select_fb_ident(struct iphdr *iph)
1331 {
1332         static DEFINE_SPINLOCK(ip_fb_id_lock);
1333         static u32 ip_fallback_id;
1334         u32 salt;
1335
1336         spin_lock_bh(&ip_fb_id_lock);
1337         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1338         iph->id = htons(salt & 0xFFFF);
1339         ip_fallback_id = salt;
1340         spin_unlock_bh(&ip_fb_id_lock);
1341 }
1342
1343 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1344 {
1345         struct rtable *rt = (struct rtable *) dst;
1346
1347         if (rt) {
1348                 if (rt->peer == NULL)
1349                         rt_bind_peer(rt, 1);
1350
1351                 /* If a peer is attached to the destination, it is never detached,
1352                    so we need not grab a lock to dereference it.
1353                  */
1354                 if (rt->peer) {
1355                         iph->id = htons(inet_getid(rt->peer, more));
1356                         return;
1357                 }
1358         } else
1359                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1360                        __builtin_return_address(0));
1361
1362         ip_select_fb_ident(iph);
1363 }
1364
1365 static void rt_del(unsigned hash, struct rtable *rt)
1366 {
1367         struct rtable **rthp, *aux;
1368
1369         rthp = &rt_hash_table[hash].chain;
1370         spin_lock_bh(rt_hash_lock_addr(hash));
1371         ip_rt_put(rt);
1372         while ((aux = *rthp) != NULL) {
1373                 if (aux == rt || rt_is_expired(aux)) {
1374                         *rthp = aux->u.dst.rt_next;
1375                         rt_free(aux);
1376                         continue;
1377                 }
1378                 rthp = &aux->u.dst.rt_next;
1379         }
1380         spin_unlock_bh(rt_hash_lock_addr(hash));
1381 }
1382
1383 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1384                     __be32 saddr, struct net_device *dev)
1385 {
1386         int i, k;
1387         struct in_device *in_dev = in_dev_get(dev);
1388         struct rtable *rth, **rthp;
1389         __be32  skeys[2] = { saddr, 0 };
1390         int  ikeys[2] = { dev->ifindex, 0 };
1391         struct netevent_redirect netevent;
1392         struct net *net;
1393
1394         if (!in_dev)
1395                 return;
1396
1397         net = dev_net(dev);
1398         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1399             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1400             ipv4_is_zeronet(new_gw))
1401                 goto reject_redirect;
1402
1403         if (!rt_caching(net))
1404                 goto reject_redirect;
1405
1406         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1407                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1408                         goto reject_redirect;
1409                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1410                         goto reject_redirect;
1411         } else {
1412                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1413                         goto reject_redirect;
1414         }
1415
1416         for (i = 0; i < 2; i++) {
1417                 for (k = 0; k < 2; k++) {
1418                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1419                                                 rt_genid(net));
1420
1421                         rthp=&rt_hash_table[hash].chain;
1422
1423                         rcu_read_lock();
1424                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1425                                 struct rtable *rt;
1426
1427                                 if (rth->fl.fl4_dst != daddr ||
1428                                     rth->fl.fl4_src != skeys[i] ||
1429                                     rth->fl.oif != ikeys[k] ||
1430                                     rth->fl.iif != 0 ||
1431                                     rt_is_expired(rth) ||
1432                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1433                                         rthp = &rth->u.dst.rt_next;
1434                                         continue;
1435                                 }
1436
1437                                 if (rth->rt_dst != daddr ||
1438                                     rth->rt_src != saddr ||
1439                                     rth->u.dst.error ||
1440                                     rth->rt_gateway != old_gw ||
1441                                     rth->u.dst.dev != dev)
1442                                         break;
1443
1444                                 dst_hold(&rth->u.dst);
1445                                 rcu_read_unlock();
1446
1447                                 rt = dst_alloc(&ipv4_dst_ops);
1448                                 if (rt == NULL) {
1449                                         ip_rt_put(rth);
1450                                         in_dev_put(in_dev);
1451                                         return;
1452                                 }
1453
1454                                 /* Copy all the information. */
1455                                 *rt = *rth;
1456                                 rt->u.dst.__use         = 1;
1457                                 atomic_set(&rt->u.dst.__refcnt, 1);
1458                                 rt->u.dst.child         = NULL;
1459                                 if (rt->u.dst.dev)
1460                                         dev_hold(rt->u.dst.dev);
1461                                 if (rt->idev)
1462                                         in_dev_hold(rt->idev);
1463                                 rt->u.dst.obsolete      = -1;
1464                                 rt->u.dst.lastuse       = jiffies;
1465                                 rt->u.dst.path          = &rt->u.dst;
1466                                 rt->u.dst.neighbour     = NULL;
1467                                 rt->u.dst.hh            = NULL;
1468 #ifdef CONFIG_XFRM
1469                                 rt->u.dst.xfrm          = NULL;
1470 #endif
1471                                 rt->rt_genid            = rt_genid(net);
1472                                 rt->rt_flags            |= RTCF_REDIRECTED;
1473
1474                                 /* Gateway is different ... */
1475                                 rt->rt_gateway          = new_gw;
1476
1477                                 /* Redirect received -> path was valid */
1478                                 dst_confirm(&rth->u.dst);
1479
1480                                 if (rt->peer)
1481                                         atomic_inc(&rt->peer->refcnt);
1482
1483                                 if (arp_bind_neighbour(&rt->u.dst) ||
1484                                     !(rt->u.dst.neighbour->nud_state &
1485                                             NUD_VALID)) {
1486                                         if (rt->u.dst.neighbour)
1487                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1488                                         ip_rt_put(rth);
1489                                         rt_drop(rt);
1490                                         goto do_next;
1491                                 }
1492
1493                                 netevent.old = &rth->u.dst;
1494                                 netevent.new = &rt->u.dst;
1495                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1496                                                         &netevent);
1497
1498                                 rt_del(hash, rth);
1499                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1500                                         ip_rt_put(rt);
1501                                 goto do_next;
1502                         }
1503                         rcu_read_unlock();
1504                 do_next:
1505                         ;
1506                 }
1507         }
1508         in_dev_put(in_dev);
1509         return;
1510
1511 reject_redirect:
1512 #ifdef CONFIG_IP_ROUTE_VERBOSE
1513         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1514                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1515                         "  Advised path = %pI4 -> %pI4\n",
1516                        &old_gw, dev->name, &new_gw,
1517                        &saddr, &daddr);
1518 #endif
1519         in_dev_put(in_dev);
1520 }
1521
1522 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1523 {
1524         struct rtable *rt = (struct rtable *)dst;
1525         struct dst_entry *ret = dst;
1526
1527         if (rt) {
1528                 if (dst->obsolete > 0) {
1529                         ip_rt_put(rt);
1530                         ret = NULL;
1531                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1532                            (rt->u.dst.expires &&
1533                             time_after_eq(jiffies, rt->u.dst.expires))) {
1534                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1535                                                 rt->fl.oif,
1536                                                 rt_genid(dev_net(dst->dev)));
1537 #if RT_CACHE_DEBUG >= 1
1538                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1539                                 &rt->rt_dst, rt->fl.fl4_tos);
1540 #endif
1541                         rt_del(hash, rt);
1542                         ret = NULL;
1543                 }
1544         }
1545         return ret;
1546 }
1547
1548 /*
1549  * Algorithm:
1550  *      1. The first ip_rt_redirect_number redirects are sent
1551  *         with exponential backoff, then we stop sending them at all,
1552  *         assuming that the host ignores our redirects.
1553  *      2. If we did not see packets requiring redirects
1554  *         during ip_rt_redirect_silence, we assume that the host
1555  *         has forgotten the redirected route, and we start sending redirects again.
1556  *
1557  * This algorithm is much cheaper and more intelligent than dumb load limiting
1558  * in icmp.c.
1559  *
1560  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1561  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1562  */
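/*
 * Editor's note - a worked example of the backoff above, assuming the
 * default sysctls set earlier in this file (ip_rt_redirect_load of HZ/50,
 * ip_rt_redirect_number of 9 and ip_rt_redirect_silence of (HZ/50) << 10);
 * with tuned sysctls the absolute delays change but the doubling does not:
 *
 *      redirect #1:    sent at once (rate_tokens is still 0)
 *      redirect #2:    sent once jiffies exceed rate_last + (HZ/50 << 1)
 *      redirect #k:    sent once jiffies exceed rate_last + (HZ/50 << (k - 1))
 *      redirect #10+:  suppressed until no redirect-worthy packets arrive for
 *                      ip_rt_redirect_silence (roughly 20 seconds), after
 *                      which rate_tokens is reset to zero below.
 */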
1563
1564 void ip_rt_send_redirect(struct sk_buff *skb)
1565 {
1566         struct rtable *rt = skb_rtable(skb);
1567         struct in_device *in_dev;
1568         int log_martians;
1569
1570         rcu_read_lock();
1571         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1572         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1573                 rcu_read_unlock();
1574                 return;
1575         }
1576         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1577         rcu_read_unlock();
1578
1579         /* No redirected packets during ip_rt_redirect_silence;
1580          * reset the algorithm.
1581          */
1582         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1583                 rt->u.dst.rate_tokens = 0;
1584
1585         /* Too many ignored redirects; do not send anything,
1586          * just set u.dst.rate_last to the last seen redirected packet.
1587          */
1588         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1589                 rt->u.dst.rate_last = jiffies;
1590                 return;
1591         }
1592
1593         /* Check for load limit; set rate_last to the latest sent
1594          * redirect.
1595          */
1596         if (rt->u.dst.rate_tokens == 0 ||
1597             time_after(jiffies,
1598                        (rt->u.dst.rate_last +
1599                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1600                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1601                 rt->u.dst.rate_last = jiffies;
1602                 ++rt->u.dst.rate_tokens;
1603 #ifdef CONFIG_IP_ROUTE_VERBOSE
1604                 if (log_martians &&
1605                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1606                     net_ratelimit())
1607                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1608                                 &rt->rt_src, rt->rt_iif,
1609                                 &rt->rt_dst, &rt->rt_gateway);
1610 #endif
1611         }
1612 }
1613
1614 static int ip_error(struct sk_buff *skb)
1615 {
1616         struct rtable *rt = skb_rtable(skb);
1617         unsigned long now;
1618         int code;
1619
1620         switch (rt->u.dst.error) {
1621                 case EINVAL:
1622                 default:
1623                         goto out;
1624                 case EHOSTUNREACH:
1625                         code = ICMP_HOST_UNREACH;
1626                         break;
1627                 case ENETUNREACH:
1628                         code = ICMP_NET_UNREACH;
1629                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1630                                         IPSTATS_MIB_INNOROUTES);
1631                         break;
1632                 case EACCES:
1633                         code = ICMP_PKT_FILTERED;
1634                         break;
1635         }
1636
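        /*
         * Editor's note: the lines below are a small token bucket limiting
         * how often ICMP errors are generated for this destination; tokens
         * accrue one per jiffy since rate_last, are capped at
         * ip_rt_error_burst, and each error sent costs ip_rt_error_cost.
         * With the defaults set earlier in this file (cost of HZ, burst of
         * 5 * HZ) that is roughly one ICMP error per second with a burst of
         * about five.
         */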
1637         now = jiffies;
1638         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1639         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1640                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1641         rt->u.dst.rate_last = now;
1642         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1643                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1644                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1645         }
1646
1647 out:    kfree_skb(skb);
1648         return 0;
1649 }
1650
1651 /*
1652  *      The last two values are not from the RFC but
1653  *      are needed for AMPRnet AX.25 paths.
1654  */
1655
1656 static const unsigned short mtu_plateau[] =
1657 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1658
1659 static inline unsigned short guess_mtu(unsigned short old_mtu)
1660 {
1661         int i;
1662
1663         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1664                 if (old_mtu > mtu_plateau[i])
1665                         return mtu_plateau[i];
1666         return 68;
1667 }
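/*
 * Editor's note - a quick sketch of the plateau search above, with values
 * worked out by hand from the table (not quoted from any RFC):
 *
 *      guess_mtu(1500) == 1492         first plateau strictly below 1500
 *      guess_mtu(4352) == 2002
 *      guess_mtu(296)  == 216
 *      guess_mtu(68)   == 68           floor when no plateau is smaller
 *
 * It only matters when a "fragmentation needed" message carries no usable
 * next-hop MTU, in which case old_mtu below is the length of the dropped
 * packet taken from the quoted IP header.
 */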
1668
1669 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1670                                  unsigned short new_mtu,
1671                                  struct net_device *dev)
1672 {
1673         int i, k;
1674         unsigned short old_mtu = ntohs(iph->tot_len);
1675         struct rtable *rth;
1676         int  ikeys[2] = { dev->ifindex, 0 };
1677         __be32  skeys[2] = { iph->saddr, 0, };
1678         __be32  daddr = iph->daddr;
1679         unsigned short est_mtu = 0;
1680
1681         for (k = 0; k < 2; k++) {
1682                 for (i = 0; i < 2; i++) {
1683                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1684                                                 rt_genid(net));
1685
1686                         rcu_read_lock();
1687                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1688                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1689                                 unsigned short mtu = new_mtu;
1690
1691                                 if (rth->fl.fl4_dst != daddr ||
1692                                     rth->fl.fl4_src != skeys[i] ||
1693                                     rth->rt_dst != daddr ||
1694                                     rth->rt_src != iph->saddr ||
1695                                     rth->fl.oif != ikeys[k] ||
1696                                     rth->fl.iif != 0 ||
1697                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1698                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1699                                     rt_is_expired(rth))
1700                                         continue;
1701
1702                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1703
1704                                         /* BSD 4.2 compatibility hack :-( */
1705                                         if (mtu == 0 &&
1706                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1707                                             old_mtu >= 68 + (iph->ihl << 2))
1708                                                 old_mtu -= iph->ihl << 2;
1709
1710                                         mtu = guess_mtu(old_mtu);
1711                                 }
1712                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1713                                         if (mtu < dst_mtu(&rth->u.dst)) {
1714                                                 dst_confirm(&rth->u.dst);
1715                                                 if (mtu < ip_rt_min_pmtu) {
1716                                                         mtu = ip_rt_min_pmtu;
1717                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1718                                                                 (1 << RTAX_MTU);
1719                                                 }
1720                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1721                                                 dst_set_expires(&rth->u.dst,
1722                                                         ip_rt_mtu_expires);
1723                                         }
1724                                         est_mtu = mtu;
1725                                 }
1726                         }
1727                         rcu_read_unlock();
1728                 }
1729         }
1730         return est_mtu ? : new_mtu;
1731 }
1732
1733 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1734 {
1735         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1736             !(dst_metric_locked(dst, RTAX_MTU))) {
1737                 if (mtu < ip_rt_min_pmtu) {
1738                         mtu = ip_rt_min_pmtu;
1739                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1740                 }
1741                 dst->metrics[RTAX_MTU-1] = mtu;
1742                 dst_set_expires(dst, ip_rt_mtu_expires);
1743                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1744         }
1745 }
1746
1747 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1748 {
1749         if (rt_is_expired((struct rtable *)dst))
1750                 return NULL;
1751         return dst;
1752 }
1753
1754 static void ipv4_dst_destroy(struct dst_entry *dst)
1755 {
1756         struct rtable *rt = (struct rtable *) dst;
1757         struct inet_peer *peer = rt->peer;
1758         struct in_device *idev = rt->idev;
1759
1760         if (peer) {
1761                 rt->peer = NULL;
1762                 inet_putpeer(peer);
1763         }
1764
1765         if (idev) {
1766                 rt->idev = NULL;
1767                 in_dev_put(idev);
1768         }
1769 }
1770
1771 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1772                             int how)
1773 {
1774         struct rtable *rt = (struct rtable *) dst;
1775         struct in_device *idev = rt->idev;
1776         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1777                 struct in_device *loopback_idev =
1778                         in_dev_get(dev_net(dev)->loopback_dev);
1779                 if (loopback_idev) {
1780                         rt->idev = loopback_idev;
1781                         in_dev_put(idev);
1782                 }
1783         }
1784 }
1785
1786 static void ipv4_link_failure(struct sk_buff *skb)
1787 {
1788         struct rtable *rt;
1789
1790         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1791
1792         rt = skb_rtable(skb);
1793         if (rt)
1794                 dst_set_expires(&rt->u.dst, 0);
1795 }
1796
1797 static int ip_rt_bug(struct sk_buff *skb)
1798 {
1799         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1800                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1801                 skb->dev ? skb->dev->name : "?");
1802         kfree_skb(skb);
1803         return 0;
1804 }
1805
1806 /*
1807    We do not cache the source address of the outgoing interface,
1808    because it is used only by the IP RR, TS and SRR options,
1809    so it stays out of the fast path.
1810
1811    BTW remember: "addr" is allowed to be unaligned
1812    in IP options!
1813  */
1814
1815 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1816 {
1817         __be32 src;
1818         struct fib_result res;
1819
1820         if (rt->fl.iif == 0)
1821                 src = rt->rt_src;
1822         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1823                 src = FIB_RES_PREFSRC(res);
1824                 fib_res_put(&res);
1825         } else
1826                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1827                                         RT_SCOPE_UNIVERSE);
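        /* copy byte-wise because, as noted above, addr may point into IP
         * options and need not be aligned */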
1828         memcpy(addr, &src, 4);
1829 }
1830
1831 #ifdef CONFIG_NET_CLS_ROUTE
1832 static void set_class_tag(struct rtable *rt, u32 tag)
1833 {
1834         if (!(rt->u.dst.tclassid & 0xFFFF))
1835                 rt->u.dst.tclassid |= tag & 0xFFFF;
1836         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1837                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1838 }
1839 #endif
1840
1841 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1842 {
1843         struct fib_info *fi = res->fi;
1844
1845         if (fi) {
1846                 if (FIB_RES_GW(*res) &&
1847                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1848                         rt->rt_gateway = FIB_RES_GW(*res);
1849                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1850                        sizeof(rt->u.dst.metrics));
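                /*
                 * Editor's note: when the nexthop supplies no MTU the device
                 * MTU is used instead, but if the MTU metric is locked and
                 * the route goes via a gateway it is clamped to a conservative
                 * 576 bytes, presumably because a locked MTU can no longer be
                 * lowered by path MTU discovery.
                 */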
1851                 if (fi->fib_mtu == 0) {
1852                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1853                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1854                             rt->rt_gateway != rt->rt_dst &&
1855                             rt->u.dst.dev->mtu > 576)
1856                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1857                 }
1858 #ifdef CONFIG_NET_CLS_ROUTE
1859                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1860 #endif
1861         } else
1862                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1863
1864         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1865                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1866         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1867                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1868         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1869                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1870                                        ip_rt_min_advmss);
1871         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1872                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1873
1874 #ifdef CONFIG_NET_CLS_ROUTE
1875 #ifdef CONFIG_IP_MULTIPLE_TABLES
1876         set_class_tag(rt, fib_rules_tclass(res));
1877 #endif
1878         set_class_tag(rt, itag);
1879 #endif
1880         rt->rt_type = res->type;
1881 }
1882
1883 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1884                                 u8 tos, struct net_device *dev, int our)
1885 {
1886         unsigned hash;
1887         struct rtable *rth;
1888         __be32 spec_dst;
1889         struct in_device *in_dev = in_dev_get(dev);
1890         u32 itag = 0;
1891
1892         /* Primary sanity checks. */
1893
1894         if (in_dev == NULL)
1895                 return -EINVAL;
1896
1897         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1898             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1899                 goto e_inval;
1900
1901         if (ipv4_is_zeronet(saddr)) {
1902                 if (!ipv4_is_local_multicast(daddr))
1903                         goto e_inval;
1904                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1905         } else if (fib_validate_source(saddr, 0, tos, 0,
1906                                         dev, &spec_dst, &itag, 0) < 0)
1907                 goto e_inval;
1908
1909         rth = dst_alloc(&ipv4_dst_ops);
1910         if (!rth)
1911                 goto e_nobufs;
1912
1913         rth->u.dst.output = ip_rt_bug;
1914         rth->u.dst.obsolete = -1;
1915
1916         atomic_set(&rth->u.dst.__refcnt, 1);
1917         rth->u.dst.flags= DST_HOST;
1918         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1919                 rth->u.dst.flags |= DST_NOPOLICY;
1920         rth->fl.fl4_dst = daddr;
1921         rth->rt_dst     = daddr;
1922         rth->fl.fl4_tos = tos;
1923         rth->fl.mark    = skb->mark;
1924         rth->fl.fl4_src = saddr;
1925         rth->rt_src     = saddr;
1926 #ifdef CONFIG_NET_CLS_ROUTE
1927         rth->u.dst.tclassid = itag;
1928 #endif
1929         rth->rt_iif     =
1930         rth->fl.iif     = dev->ifindex;
1931         rth->u.dst.dev  = init_net.loopback_dev;
1932         dev_hold(rth->u.dst.dev);
1933         rth->idev       = in_dev_get(rth->u.dst.dev);
1934         rth->fl.oif     = 0;
1935         rth->rt_gateway = daddr;
1936         rth->rt_spec_dst= spec_dst;
1937         rth->rt_genid   = rt_genid(dev_net(dev));
1938         rth->rt_flags   = RTCF_MULTICAST;
1939         rth->rt_type    = RTN_MULTICAST;
1940         if (our) {
1941                 rth->u.dst.input= ip_local_deliver;
1942                 rth->rt_flags |= RTCF_LOCAL;
1943         }
1944
1945 #ifdef CONFIG_IP_MROUTE
1946         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1947                 rth->u.dst.input = ip_mr_input;
1948 #endif
1949         RT_CACHE_STAT_INC(in_slow_mc);
1950
1951         in_dev_put(in_dev);
1952         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1953         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1954
1955 e_nobufs:
1956         in_dev_put(in_dev);
1957         return -ENOBUFS;
1958
1959 e_inval:
1960         in_dev_put(in_dev);
1961         return -EINVAL;
1962 }
1963
1964
1965 static void ip_handle_martian_source(struct net_device *dev,
1966                                      struct in_device *in_dev,
1967                                      struct sk_buff *skb,
1968                                      __be32 daddr,
1969                                      __be32 saddr)
1970 {
1971         RT_CACHE_STAT_INC(in_martian_src);
1972 #ifdef CONFIG_IP_ROUTE_VERBOSE
1973         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1974                 /*
1975                  *      RFC 1812 recommendation: if the source is martian,
1976                  *      the only hint is the MAC header.
1977                  */
1978                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1979                         &daddr, &saddr, dev->name);
1980                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1981                         int i;
1982                         const unsigned char *p = skb_mac_header(skb);
1983                         printk(KERN_WARNING "ll header: ");
1984                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1985                                 printk("%02x", *p);
1986                                 if (i < (dev->hard_header_len - 1))
1987                                         printk(":");
1988                         }
1989                         printk("\n");
1990                 }
1991         }
1992 #endif
1993 }
1994
1995 static int __mkroute_input(struct sk_buff *skb,
1996                            struct fib_result *res,
1997                            struct in_device *in_dev,
1998                            __be32 daddr, __be32 saddr, u32 tos,
1999                            struct rtable **result)
2000 {
2001
2002         struct rtable *rth;
2003         int err;
2004         struct in_device *out_dev;
2005         unsigned flags = 0;
2006         __be32 spec_dst;
2007         u32 itag;
2008
2009         /* get a working reference to the output device */
2010         out_dev = in_dev_get(FIB_RES_DEV(*res));
2011         if (out_dev == NULL) {
2012                 if (net_ratelimit())
2013                         printk(KERN_CRIT "Bug in ip_route_input" \
2014                                "_slow(). Please, report\n");
2015                 return -EINVAL;
2016         }
2017
2018
2019         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2020                                   in_dev->dev, &spec_dst, &itag, skb->mark);
2021         if (err < 0) {
2022                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2023                                          saddr);
2024
2025                 err = -EINVAL;
2026                 goto cleanup;
2027         }
2028
2029         if (err)
2030                 flags |= RTCF_DIRECTSRC;
2031
2032         if (out_dev == in_dev && err &&
2033             (IN_DEV_SHARED_MEDIA(out_dev) ||
2034              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2035                 flags |= RTCF_DOREDIRECT;
2036
2037         if (skb->protocol != htons(ETH_P_IP)) {
2038                 /* Not IP (i.e. ARP). Do not create a route if it is
2039                  * invalid for proxy ARP. DNAT routes are always valid.
2040                  *
2041                  * The proxy ARP feature has been extended to allow ARP
2042                  * replies back out the same interface, to support
2043                  * Private VLAN switch technologies. See arp.c.
2044                  */
2045                 if (out_dev == in_dev &&
2046                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2047                         err = -EINVAL;
2048                         goto cleanup;
2049                 }
2050         }
2051
2052
2053         rth = dst_alloc(&ipv4_dst_ops);
2054         if (!rth) {
2055                 err = -ENOBUFS;
2056                 goto cleanup;
2057         }
2058
2059         atomic_set(&rth->u.dst.__refcnt, 1);
2060         rth->u.dst.flags= DST_HOST;
2061         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2062                 rth->u.dst.flags |= DST_NOPOLICY;
2063         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2064                 rth->u.dst.flags |= DST_NOXFRM;
2065         rth->fl.fl4_dst = daddr;
2066         rth->rt_dst     = daddr;
2067         rth->fl.fl4_tos = tos;
2068         rth->fl.mark    = skb->mark;
2069         rth->fl.fl4_src = saddr;
2070         rth->rt_src     = saddr;
2071         rth->rt_gateway = daddr;
2072         rth->rt_iif     =
2073                 rth->fl.iif     = in_dev->dev->ifindex;
2074         rth->u.dst.dev  = (out_dev)->dev;
2075         dev_hold(rth->u.dst.dev);
2076         rth->idev       = in_dev_get(rth->u.dst.dev);
2077         rth->fl.oif     = 0;
2078         rth->rt_spec_dst= spec_dst;
2079
2080         rth->u.dst.obsolete = -1;
2081         rth->u.dst.input = ip_forward;
2082         rth->u.dst.output = ip_output;
2083         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2084
2085         rt_set_nexthop(rth, res, itag);
2086
2087         rth->rt_flags = flags;
2088
2089         *result = rth;
2090         err = 0;
2091  cleanup:
2092         /* release the working reference to the output device */
2093         in_dev_put(out_dev);
2094         return err;
2095 }
2096
2097 static int ip_mkroute_input(struct sk_buff *skb,
2098                             struct fib_result *res,
2099                             const struct flowi *fl,
2100                             struct in_device *in_dev,
2101                             __be32 daddr, __be32 saddr, u32 tos)
2102 {
2103         struct rtable* rth = NULL;
2104         int err;
2105         unsigned hash;
2106
2107 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2108         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2109                 fib_select_multipath(fl, res);
2110 #endif
2111
2112         /* create a routing cache entry */
2113         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2114         if (err)
2115                 return err;
2116
2117         /* put it into the cache */
2118         hash = rt_hash(daddr, saddr, fl->iif,
2119                        rt_genid(dev_net(rth->u.dst.dev)));
2120         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2121 }
2122
2123 /*
2124  *      NOTE. We drop all packets that have a local source
2125  *      address, because every properly looped-back packet
2126  *      must already have the correct destination attached by the output routine.
2127  *
2128  *      Such an approach solves two big problems:
2129  *      1. Non-simplex devices are handled properly.
2130  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2131  */
2132
2133 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2134                                u8 tos, struct net_device *dev)
2135 {
2136         struct fib_result res;
2137         struct in_device *in_dev = in_dev_get(dev);
2138         struct flowi fl = { .nl_u = { .ip4_u =
2139                                       { .daddr = daddr,
2140                                         .saddr = saddr,
2141                                         .tos = tos,
2142                                         .scope = RT_SCOPE_UNIVERSE,
2143                                       } },
2144                             .mark = skb->mark,
2145                             .iif = dev->ifindex };
2146         unsigned        flags = 0;
2147         u32             itag = 0;
2148         struct rtable * rth;
2149         unsigned        hash;
2150         __be32          spec_dst;
2151         int             err = -EINVAL;
2152         int             free_res = 0;
2153         struct net    * net = dev_net(dev);
2154
2155         /* IP on this device is disabled. */
2156
2157         if (!in_dev)
2158                 goto out;
2159
2160         /* Check for the most weird martians, which cannot be detected
2161            by fib_lookup.
2162          */
2163
2164         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2165             ipv4_is_loopback(saddr))
2166                 goto martian_source;
2167
2168         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2169                 goto brd_input;
2170
2171         /* Accept zero addresses only for limited broadcast;
2172          * I do not even know whether to fix this or not. Waiting for complaints :-)
2173          */
2174         if (ipv4_is_zeronet(saddr))
2175                 goto martian_source;
2176
2177         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2178             ipv4_is_loopback(daddr))
2179                 goto martian_destination;
2180
2181         /*
2182          *      Now we are ready to route the packet.
2183          */
2184         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2185                 if (!IN_DEV_FORWARD(in_dev))
2186                         goto e_hostunreach;
2187                 goto no_route;
2188         }
2189         free_res = 1;
2190
2191         RT_CACHE_STAT_INC(in_slow_tot);
2192
2193         if (res.type == RTN_BROADCAST)
2194                 goto brd_input;
2195
2196         if (res.type == RTN_LOCAL) {
2197                 int result;
2198                 result = fib_validate_source(saddr, daddr, tos,
2199                                              net->loopback_dev->ifindex,
2200                                              dev, &spec_dst, &itag, skb->mark);
2201                 if (result < 0)
2202                         goto martian_source;
2203                 if (result)
2204                         flags |= RTCF_DIRECTSRC;
2205                 spec_dst = daddr;
2206                 goto local_input;
2207         }
2208
2209         if (!IN_DEV_FORWARD(in_dev))
2210                 goto e_hostunreach;
2211         if (res.type != RTN_UNICAST)
2212                 goto martian_destination;
2213
2214         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2215 done:
2216         in_dev_put(in_dev);
2217         if (free_res)
2218                 fib_res_put(&res);
2219 out:    return err;
2220
2221 brd_input:
2222         if (skb->protocol != htons(ETH_P_IP))
2223                 goto e_inval;
2224
2225         if (ipv4_is_zeronet(saddr))
2226                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2227         else {
2228                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2229                                           &itag, skb->mark);
2230                 if (err < 0)
2231                         goto martian_source;
2232                 if (err)
2233                         flags |= RTCF_DIRECTSRC;
2234         }
2235         flags |= RTCF_BROADCAST;
2236         res.type = RTN_BROADCAST;
2237         RT_CACHE_STAT_INC(in_brd);
2238
2239 local_input:
2240         rth = dst_alloc(&ipv4_dst_ops);
2241         if (!rth)
2242                 goto e_nobufs;
2243
2244         rth->u.dst.output= ip_rt_bug;
2245         rth->u.dst.obsolete = -1;
2246         rth->rt_genid = rt_genid(net);
2247
2248         atomic_set(&rth->u.dst.__refcnt, 1);
2249         rth->u.dst.flags= DST_HOST;
2250         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2251                 rth->u.dst.flags |= DST_NOPOLICY;
2252         rth->fl.fl4_dst = daddr;
2253         rth->rt_dst     = daddr;
2254         rth->fl.fl4_tos = tos;
2255         rth->fl.mark    = skb->mark;
2256         rth->fl.fl4_src = saddr;
2257         rth->rt_src     = saddr;
2258 #ifdef CONFIG_NET_CLS_ROUTE
2259         rth->u.dst.tclassid = itag;
2260 #endif
2261         rth->rt_iif     =
2262         rth->fl.iif     = dev->ifindex;
2263         rth->u.dst.dev  = net->loopback_dev;
2264         dev_hold(rth->u.dst.dev);
2265         rth->idev       = in_dev_get(rth->u.dst.dev);
2266         rth->rt_gateway = daddr;
2267         rth->rt_spec_dst= spec_dst;
2268         rth->u.dst.input= ip_local_deliver;
2269         rth->rt_flags   = flags|RTCF_LOCAL;
2270         if (res.type == RTN_UNREACHABLE) {
2271                 rth->u.dst.input= ip_error;
2272                 rth->u.dst.error= -err;
2273                 rth->rt_flags   &= ~RTCF_LOCAL;
2274         }
2275         rth->rt_type    = res.type;
2276         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2277         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2278         goto done;
2279
2280 no_route:
2281         RT_CACHE_STAT_INC(in_no_route);
2282         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2283         res.type = RTN_UNREACHABLE;
2284         if (err == -ESRCH)
2285                 err = -ENETUNREACH;
2286         goto local_input;
2287
2288         /*
2289          *      Do not cache martian addresses: they should be logged (RFC1812)
2290          */
2291 martian_destination:
2292         RT_CACHE_STAT_INC(in_martian_dst);
2293 #ifdef CONFIG_IP_ROUTE_VERBOSE
2294         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2295                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2296                         &daddr, &saddr, dev->name);
2297 #endif
2298
2299 e_hostunreach:
2300         err = -EHOSTUNREACH;
2301         goto done;
2302
2303 e_inval:
2304         err = -EINVAL;
2305         goto done;
2306
2307 e_nobufs:
2308         err = -ENOBUFS;
2309         goto done;
2310
2311 martian_source:
2312         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2313         goto e_inval;
2314 }
2315
2316 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2317                            u8 tos, struct net_device *dev, bool noref)
2318 {
2319         struct rtable * rth;
2320         unsigned        hash;
2321         int iif = dev->ifindex;
2322         struct net *net;
2323
2324         net = dev_net(dev);
2325
2326         if (!rt_caching(net))
2327                 goto skip_cache;
2328
2329         tos &= IPTOS_RT_MASK;
2330         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2331
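        /*
         * Editor's note: the loop below folds the whole flow-key comparison
         * into one test by OR-ing together the XOR of each cached field with
         * the wanted value; the result is zero only when dst, src, iif and
         * tos all match and the cached entry has no oif (i.e. it is an input
         * route).  mark, namespace and generation are then checked separately.
         */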
2332         rcu_read_lock();
2333         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2334              rth = rcu_dereference(rth->u.dst.rt_next)) {
2335                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2336                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2337                      (rth->fl.iif ^ iif) |
2338                      rth->fl.oif |
2339                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2340                     rth->fl.mark == skb->mark &&
2341                     net_eq(dev_net(rth->u.dst.dev), net) &&
2342                     !rt_is_expired(rth)) {
2343                         if (noref) {
2344                                 dst_use_noref(&rth->u.dst, jiffies);
2345                                 skb_dst_set_noref(skb, &rth->u.dst);
2346                         } else {
2347                                 dst_use(&rth->u.dst, jiffies);
2348                                 skb_dst_set(skb, &rth->u.dst);
2349                         }
2350                         RT_CACHE_STAT_INC(in_hit);
2351                         rcu_read_unlock();
2352                         return 0;
2353                 }
2354                 RT_CACHE_STAT_INC(in_hlist_search);
2355         }
2356         rcu_read_unlock();
2357
2358 skip_cache:
2359         /* Multicast recognition logic is moved from the route cache to here.
2360            The problem was that too many Ethernet cards have broken/missing
2361            hardware multicast filters :-( As a result, a host on a multicast
2362            network acquires a lot of useless route cache entries, e.g. for
2363            SDR messages from all over the world. Now we try to get rid of them.
2364            Really, provided the software IP multicast filter is organized
2365            reasonably (at least, hashed), it does not result in a slowdown
2366            compared with route cache reject entries.
2367            Note that multicast routers are not affected, because a
2368            route cache entry is created eventually.
2369          */
2370         if (ipv4_is_multicast(daddr)) {
2371                 struct in_device *in_dev;
2372
2373                 rcu_read_lock();
2374                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2375                         int our = ip_check_mc(in_dev, daddr, saddr,
2376                                 ip_hdr(skb)->protocol);
2377                         if (our
2378 #ifdef CONFIG_IP_MROUTE
2379                                 ||
2380                             (!ipv4_is_local_multicast(daddr) &&
2381                              IN_DEV_MFORWARD(in_dev))
2382 #endif
2383                            ) {
2384                                 rcu_read_unlock();
2385                                 return ip_route_input_mc(skb, daddr, saddr,
2386                                                          tos, dev, our);
2387                         }
2388                 }
2389                 rcu_read_unlock();
2390                 return -EINVAL;
2391         }
2392         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2393 }
2394 EXPORT_SYMBOL(ip_route_input_common);
2395
2396 static int __mkroute_output(struct rtable **result,
2397                             struct fib_result *res,
2398                             const struct flowi *fl,
2399                             const struct flowi *oldflp,
2400                             struct net_device *dev_out,
2401                             unsigned flags)
2402 {
2403         struct rtable *rth;
2404         struct in_device *in_dev;
2405         u32 tos = RT_FL_TOS(oldflp);
2406         int err = 0;
2407
2408         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2409                 return -EINVAL;
2410
2411         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2412                 res->type = RTN_BROADCAST;
2413         else if (ipv4_is_multicast(fl->fl4_dst))
2414                 res->type = RTN_MULTICAST;
2415         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2416                 return -EINVAL;
2417
2418         if (dev_out->flags & IFF_LOOPBACK)
2419                 flags |= RTCF_LOCAL;
2420
2421         /* get work reference to inet device */
2422         in_dev = in_dev_get(dev_out);
2423         if (!in_dev)
2424                 return -EINVAL;
2425
2426         if (res->type == RTN_BROADCAST) {
2427                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2428                 if (res->fi) {
2429                         fib_info_put(res->fi);
2430                         res->fi = NULL;
2431                 }
2432         } else if (res->type == RTN_MULTICAST) {
2433                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2434                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2435                                  oldflp->proto))
2436                         flags &= ~RTCF_LOCAL;
2437                 /* If a multicast route does not exist, use the
2438                    default one, but do not set a gateway in this case.
2439                    Yes, it is a hack.
2440                  */
2441                 if (res->fi && res->prefixlen < 4) {
2442                         fib_info_put(res->fi);
2443                         res->fi = NULL;
2444                 }
2445         }
2446
2447
2448         rth = dst_alloc(&ipv4_dst_ops);
2449         if (!rth) {
2450                 err = -ENOBUFS;
2451                 goto cleanup;
2452         }
2453
2454         atomic_set(&rth->u.dst.__refcnt, 1);
2455         rth->u.dst.flags= DST_HOST;
2456         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2457                 rth->u.dst.flags |= DST_NOXFRM;
2458         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2459                 rth->u.dst.flags |= DST_NOPOLICY;
2460
2461         rth->fl.fl4_dst = oldflp->fl4_dst;
2462         rth->fl.fl4_tos = tos;
2463         rth->fl.fl4_src = oldflp->fl4_src;
2464         rth->fl.oif     = oldflp->oif;
2465         rth->fl.mark    = oldflp->mark;
2466         rth->rt_dst     = fl->fl4_dst;
2467         rth->rt_src     = fl->fl4_src;
2468         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2469         /* get references to the devices that are to be held by the routing
2470            cache entry */
2471         rth->u.dst.dev  = dev_out;
2472         dev_hold(dev_out);
2473         rth->idev       = in_dev_get(dev_out);
2474         rth->rt_gateway = fl->fl4_dst;
2475         rth->rt_spec_dst= fl->fl4_src;
2476
2477         rth->u.dst.output=ip_output;
2478         rth->u.dst.obsolete = -1;
2479         rth->rt_genid = rt_genid(dev_net(dev_out));
2480
2481         RT_CACHE_STAT_INC(out_slow_tot);
2482
2483         if (flags & RTCF_LOCAL) {
2484                 rth->u.dst.input = ip_local_deliver;
2485                 rth->rt_spec_dst = fl->fl4_dst;
2486         }
2487         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2488                 rth->rt_spec_dst = fl->fl4_src;
2489                 if (flags & RTCF_LOCAL &&
2490                     !(dev_out->flags & IFF_LOOPBACK)) {
2491                         rth->u.dst.output = ip_mc_output;
2492                         RT_CACHE_STAT_INC(out_slow_mc);
2493                 }
2494 #ifdef CONFIG_IP_MROUTE
2495                 if (res->type == RTN_MULTICAST) {
2496                         if (IN_DEV_MFORWARD(in_dev) &&
2497                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2498                                 rth->u.dst.input = ip_mr_input;
2499                                 rth->u.dst.output = ip_mc_output;
2500                         }
2501                 }
2502 #endif
2503         }
2504
2505         rt_set_nexthop(rth, res, 0);
2506
2507         rth->rt_flags = flags;
2508
2509         *result = rth;
2510  cleanup:
2511         /* release work reference to inet device */
2512         in_dev_put(in_dev);
2513
2514         return err;
2515 }
2516
2517 static int ip_mkroute_output(struct rtable **rp,
2518                              struct fib_result *res,
2519                              const struct flowi *fl,
2520                              const struct flowi *oldflp,
2521                              struct net_device *dev_out,
2522                              unsigned flags)
2523 {
2524         struct rtable *rth = NULL;
2525         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2526         unsigned hash;
2527         if (err == 0) {
2528                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2529                                rt_genid(dev_net(dev_out)));
2530                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2531         }
2532
2533         return err;
2534 }
2535
2536 /*
2537  * Major route resolver routine.
2538  */
2539
2540 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2541                                 const struct flowi *oldflp)
2542 {
2543         u32 tos = RT_FL_TOS(oldflp);
2544         struct flowi fl = { .nl_u = { .ip4_u =
2545                                       { .daddr = oldflp->fl4_dst,
2546                                         .saddr = oldflp->fl4_src,
2547                                         .tos = tos & IPTOS_RT_MASK,
2548                                         .scope = ((tos & RTO_ONLINK) ?
2549                                                   RT_SCOPE_LINK :
2550                                                   RT_SCOPE_UNIVERSE),
2551                                       } },
2552                             .mark = oldflp->mark,
2553                             .iif = net->loopback_dev->ifindex,
2554                             .oif = oldflp->oif };
2555         struct fib_result res;
2556         unsigned flags = 0;
2557         struct net_device *dev_out = NULL;
2558         int free_res = 0;
2559         int err;
2560
2561
2562         res.fi          = NULL;
2563 #ifdef CONFIG_IP_MULTIPLE_TABLES
2564         res.r           = NULL;
2565 #endif
2566
2567         if (oldflp->fl4_src) {
2568                 err = -EINVAL;
2569                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2570                     ipv4_is_lbcast(oldflp->fl4_src) ||
2571                     ipv4_is_zeronet(oldflp->fl4_src))
2572                         goto out;
2573
2574                 /* I removed the check for oif == dev_out->oif here.
2575                    It was wrong for two reasons:
2576                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2577                       is assigned to multiple interfaces.
2578                    2. Moreover, we are allowed to send packets with saddr
2579                       of another iface. --ANK
2580                  */
2581
2582                 if (oldflp->oif == 0 &&
2583                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2584                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2585                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2586                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2587                         if (dev_out == NULL)
2588                                 goto out;
2589
2590                         /* Special hack: the user can direct multicasts
2591                            and limited broadcast via the necessary interface
2592                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2593                            This hack is not just for fun, it allows
2594                            vic, vat and friends to work.
2595                            They bind the socket to loopback, set the ttl to zero
2596                            and expect that it will work.
2597                            From the viewpoint of the routing cache they are broken,
2598                            because we are not allowed to build a multicast path
2599                            with a loopback source addr (look, the routing cache
2600                            cannot know that the ttl is zero, so the packet
2601                            will not leave this host and the route is valid).
2602                            Luckily, this hack is a good workaround.
2603                          */
2604
2605                         fl.oif = dev_out->ifindex;
2606                         goto make_route;
2607                 }
2608
2609                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2610                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2611                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2612                         if (dev_out == NULL)
2613                                 goto out;
2614                         dev_put(dev_out);
2615                         dev_out = NULL;
2616                 }
2617         }
2618
2619
2620         if (oldflp->oif) {
2621                 dev_out = dev_get_by_index(net, oldflp->oif);
2622                 err = -ENODEV;
2623                 if (dev_out == NULL)
2624                         goto out;
2625
2626                 /* RACE: Check return value of inet_select_addr instead. */
2627                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2628                         dev_put(dev_out);
2629                         goto out;       /* Wrong error code */
2630                 }
2631
2632                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2633                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2634                         if (!fl.fl4_src)
2635                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2636                                                               RT_SCOPE_LINK);
2637                         goto make_route;
2638                 }
2639                 if (!fl.fl4_src) {
2640                         if (ipv4_is_multicast(oldflp->fl4_dst))
2641                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2642                                                               fl.fl4_scope);
2643                         else if (!oldflp->fl4_dst)
2644                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2645                                                               RT_SCOPE_HOST);
2646                 }
2647         }
2648
2649         if (!fl.fl4_dst) {
2650                 fl.fl4_dst = fl.fl4_src;
2651                 if (!fl.fl4_dst)
2652                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2653                 if (dev_out)
2654                         dev_put(dev_out);
2655                 dev_out = net->loopback_dev;
2656                 dev_hold(dev_out);
2657                 fl.oif = net->loopback_dev->ifindex;
2658                 res.type = RTN_LOCAL;
2659                 flags |= RTCF_LOCAL;
2660                 goto make_route;
2661         }
2662
2663         if (fib_lookup(net, &fl, &res)) {
2664                 res.fi = NULL;
2665                 if (oldflp->oif) {
2666                         /* Apparently, the routing tables are wrong. Assume
2667                            that the destination is on-link.
2668
2669                            WHY? DW.
2670                            Because we are allowed to send to an iface
2671                            even if it has NO routes and NO assigned
2672                            addresses. When oif is specified, the routing
2673                            tables are looked up with only one purpose:
2674                            to check whether the destination is gatewayed, rather than
2675                            direct. Moreover, if MSG_DONTROUTE is set,
2676                            we send the packet, ignoring both routing tables
2677                            and ifaddr state. --ANK
2678
2679
2680                            We could do this even when oif is unknown,
2681                            likely IPv6, but we do not.
2682                          */
2683
2684                         if (fl.fl4_src == 0)
2685                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2686                                                               RT_SCOPE_LINK);
2687                         res.type = RTN_UNICAST;
2688                         goto make_route;
2689                 }
2690                 if (dev_out)
2691                         dev_put(dev_out);
2692                 err = -ENETUNREACH;
2693                 goto out;
2694         }
2695         free_res = 1;
2696
2697         if (res.type == RTN_LOCAL) {
2698                 if (!fl.fl4_src)
2699                         fl.fl4_src = fl.fl4_dst;
2700                 if (dev_out)
2701                         dev_put(dev_out);
2702                 dev_out = net->loopback_dev;
2703                 dev_hold(dev_out);
2704                 fl.oif = dev_out->ifindex;
2705                 if (res.fi)
2706                         fib_info_put(res.fi);
2707                 res.fi = NULL;
2708                 flags |= RTCF_LOCAL;
2709                 goto make_route;
2710         }
2711
2712 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2713         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2714                 fib_select_multipath(&fl, &res);
2715         else
2716 #endif
2717         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2718                 fib_select_default(net, &fl, &res);
2719
2720         if (!fl.fl4_src)
2721                 fl.fl4_src = FIB_RES_PREFSRC(res);
2722
2723         if (dev_out)
2724                 dev_put(dev_out);
2725         dev_out = FIB_RES_DEV(res);
2726         dev_hold(dev_out);
2727         fl.oif = dev_out->ifindex;
2728
2729
2730 make_route:
2731         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2732
2733
2734         if (free_res)
2735                 fib_res_put(&res);
2736         if (dev_out)
2737                 dev_put(dev_out);
2738 out:    return err;
2739 }
2740
2741 int __ip_route_output_key(struct net *net, struct rtable **rp,
2742                           const struct flowi *flp)
2743 {
2744         unsigned hash;
2745         struct rtable *rth;
2746
2747         if (!rt_caching(net))
2748                 goto slow_output;
2749
2750         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2751
2752         rcu_read_lock_bh();
2753         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2754                 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2755                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2756                     rth->fl.fl4_src == flp->fl4_src &&
2757                     rth->fl.iif == 0 &&
2758                     rth->fl.oif == flp->oif &&
2759                     rth->fl.mark == flp->mark &&
2760                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2761                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2762                     net_eq(dev_net(rth->u.dst.dev), net) &&
2763                     !rt_is_expired(rth)) {
2764                         dst_use(&rth->u.dst, jiffies);
2765                         RT_CACHE_STAT_INC(out_hit);
2766                         rcu_read_unlock_bh();
2767                         *rp = rth;
2768                         return 0;
2769                 }
2770                 RT_CACHE_STAT_INC(out_hlist_search);
2771         }
2772         rcu_read_unlock_bh();
2773
2774 slow_output:
2775         return ip_route_output_slow(net, rp, flp);
2776 }
2777
2778 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2779
2780 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2781 {
2782 }
2783
2784 static struct dst_ops ipv4_dst_blackhole_ops = {
2785         .family                 =       AF_INET,
2786         .protocol               =       cpu_to_be16(ETH_P_IP),
2787         .destroy                =       ipv4_dst_destroy,
2788         .check                  =       ipv4_dst_check,
2789         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2790         .entries                =       ATOMIC_INIT(0),
2791 };
2792
2793
2794 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2795 {
2796         struct rtable *ort = *rp;
2797         struct rtable *rt = (struct rtable *)
2798                 dst_alloc(&ipv4_dst_blackhole_ops);
2799
2800         if (rt) {
2801                 struct dst_entry *new = &rt->u.dst;
2802
2803                 atomic_set(&new->__refcnt, 1);
2804                 new->__use = 1;
2805                 new->input = dst_discard;
2806                 new->output = dst_discard;
2807                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2808
2809                 new->dev = ort->u.dst.dev;
2810                 if (new->dev)
2811                         dev_hold(new->dev);
2812
2813                 rt->fl = ort->fl;
2814
2815                 rt->idev = ort->idev;
2816                 if (rt->idev)
2817                         in_dev_hold(rt->idev);
2818                 rt->rt_genid = rt_genid(net);
2819                 rt->rt_flags = ort->rt_flags;
2820                 rt->rt_type = ort->rt_type;
2821                 rt->rt_dst = ort->rt_dst;
2822                 rt->rt_src = ort->rt_src;
2823                 rt->rt_iif = ort->rt_iif;
2824                 rt->rt_gateway = ort->rt_gateway;
2825                 rt->rt_spec_dst = ort->rt_spec_dst;
2826                 rt->peer = ort->peer;
2827                 if (rt->peer)
2828                         atomic_inc(&rt->peer->refcnt);
2829
2830                 dst_free(new);
2831         }
2832
2833         dst_release(&(*rp)->u.dst);
2834         *rp = rt;
2835         return (rt ? 0 : -ENOMEM);
2836 }
2837
2838 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2839                          struct sock *sk, int flags)
2840 {
2841         int err;
2842
2843         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2844                 return err;
2845
2846         if (flp->proto) {
2847                 if (!flp->fl4_src)
2848                         flp->fl4_src = (*rp)->rt_src;
2849                 if (!flp->fl4_dst)
2850                         flp->fl4_dst = (*rp)->rt_dst;
2851                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2852                                     flags ? XFRM_LOOKUP_WAIT : 0);
2853                 if (err == -EREMOTE)
2854                         err = ipv4_dst_blackhole(net, rp, flp);
2855
2856                 return err;
2857         }
2858
2859         return 0;
2860 }
2861
2862 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2863
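     /* Convenience wrapper: ip_route_output_flow() without a socket or flags. */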
2864 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2865 {
2866         return ip_route_output_flow(net, rp, flp, NULL, 0);
2867 }
2868
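     /*
      * Fill a netlink route message from the cached route attached to @skb.
      * Returns the result of nlmsg_end() on success, or -EMSGSIZE if the
      * attributes do not fit into the message.
      */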
2869 static int rt_fill_info(struct net *net,
2870                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2871                         int nowait, unsigned int flags)
2872 {
2873         struct rtable *rt = skb_rtable(skb);
2874         struct rtmsg *r;
2875         struct nlmsghdr *nlh;
2876         long expires;
2877         u32 id = 0, ts = 0, tsage = 0, error;
2878
2879         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2880         if (nlh == NULL)
2881                 return -EMSGSIZE;
2882
2883         r = nlmsg_data(nlh);
2884         r->rtm_family    = AF_INET;
2885         r->rtm_dst_len  = 32;
2886         r->rtm_src_len  = 0;
2887         r->rtm_tos      = rt->fl.fl4_tos;
2888         r->rtm_table    = RT_TABLE_MAIN;
2889         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2890         r->rtm_type     = rt->rt_type;
2891         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2892         r->rtm_protocol = RTPROT_UNSPEC;
2893         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2894         if (rt->rt_flags & RTCF_NOTIFY)
2895                 r->rtm_flags |= RTM_F_NOTIFY;
2896
2897         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2898
2899         if (rt->fl.fl4_src) {
2900                 r->rtm_src_len = 32;
2901                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2902         }
2903         if (rt->u.dst.dev)
2904                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2905 #ifdef CONFIG_NET_CLS_ROUTE
2906         if (rt->u.dst.tclassid)
2907                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2908 #endif
2909         if (rt->fl.iif)
2910                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2911         else if (rt->rt_src != rt->fl.fl4_src)
2912                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2913
2914         if (rt->rt_dst != rt->rt_gateway)
2915                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2916
2917         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2918                 goto nla_put_failure;
2919
2920         error = rt->u.dst.error;
2921         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2922         if (rt->peer) {
2923                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2924                 if (rt->peer->tcp_ts_stamp) {
2925                         ts = rt->peer->tcp_ts;
2926                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2927                 }
2928         }
2929
2930         if (rt->fl.iif) {
2931 #ifdef CONFIG_IP_MROUTE
2932                 __be32 dst = rt->rt_dst;
2933
2934                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2935                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2936                         int err = ipmr_get_route(net, skb, r, nowait);
2937                         if (err <= 0) {
2938                                 if (!nowait) {
2939                                         if (err == 0)
2940                                                 return 0;
2941                                         goto nla_put_failure;
2942                                 } else {
2943                                         if (err == -EMSGSIZE)
2944                                                 goto nla_put_failure;
2945                                         error = err;
2946                                 }
2947                         }
2948                 } else
2949 #endif
2950                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2951         }
2952
2953         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2954                                expires, error) < 0)
2955                 goto nla_put_failure;
2956
2957         return nlmsg_end(skb, nlh);
2958
2959 nla_put_failure:
2960         nlmsg_cancel(skb, nlh);
2961         return -EMSGSIZE;
2962 }
2963
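     /*
      * RTM_GETROUTE handler.  Builds a dummy skb, performs an input route
      * lookup when RTA_IIF is supplied (otherwise an output lookup on the
      * given source/destination/tos/oif), and unicasts the resulting route
      * back to the requester via rt_fill_info().
      */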
2964 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2965 {
2966         struct net *net = sock_net(in_skb->sk);
2967         struct rtmsg *rtm;
2968         struct nlattr *tb[RTA_MAX+1];
2969         struct rtable *rt = NULL;
2970         __be32 dst = 0;
2971         __be32 src = 0;
2972         u32 iif;
2973         int err;
2974         struct sk_buff *skb;
2975
2976         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2977         if (err < 0)
2978                 goto errout;
2979
2980         rtm = nlmsg_data(nlh);
2981
2982         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2983         if (skb == NULL) {
2984                 err = -ENOBUFS;
2985                 goto errout;
2986         }
2987
2988         /* Reserve room for dummy headers; this skb can pass
2989            through a good chunk of the routing engine.
2990          */
2991         skb_reset_mac_header(skb);
2992         skb_reset_network_header(skb);
2993
2994         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2995         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2996         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2997
2998         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2999         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3000         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3001
3002         if (iif) {
3003                 struct net_device *dev;
3004
3005                 dev = __dev_get_by_index(net, iif);
3006                 if (dev == NULL) {
3007                         err = -ENODEV;
3008                         goto errout_free;
3009                 }
3010
3011                 skb->protocol   = htons(ETH_P_IP);
3012                 skb->dev        = dev;
3013                 local_bh_disable();
3014                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3015                 local_bh_enable();
3016
3017                 rt = skb_rtable(skb);
3018                 if (err == 0 && rt->u.dst.error)
3019                         err = -rt->u.dst.error;
3020         } else {
3021                 struct flowi fl = {
3022                         .nl_u = {
3023                                 .ip4_u = {
3024                                         .daddr = dst,
3025                                         .saddr = src,
3026                                         .tos = rtm->rtm_tos,
3027                                 },
3028                         },
3029                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3030                 };
3031                 err = ip_route_output_key(net, &rt, &fl);
3032         }
3033
3034         if (err)
3035                 goto errout_free;
3036
3037         skb_dst_set(skb, &rt->u.dst);
3038         if (rtm->rtm_flags & RTM_F_NOTIFY)
3039                 rt->rt_flags |= RTCF_NOTIFY;
3040
3041         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3042                            RTM_NEWROUTE, 0, 0);
3043         if (err <= 0)
3044                 goto errout_free;
3045
3046         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3047 errout:
3048         return err;
3049
3050 errout_free:
3051         kfree_skb(skb);
3052         goto errout;
3053 }
3054
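     /*
      * Netlink dump of the routing cache.  cb->args[0] and cb->args[1]
      * record the hash bucket and chain position so an interrupted dump
      * can be resumed where it left off.
      */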
3055 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3056 {
3057         struct rtable *rt;
3058         int h, s_h;
3059         int idx, s_idx;
3060         struct net *net;
3061
3062         net = sock_net(skb->sk);
3063
3064         s_h = cb->args[0];
3065         if (s_h < 0)
3066                 s_h = 0;
3067         s_idx = idx = cb->args[1];
3068         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3069                 if (!rt_hash_table[h].chain)
3070                         continue;
3071                 rcu_read_lock_bh();
3072                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3073                      rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3074                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3075                                 continue;
3076                         if (rt_is_expired(rt))
3077                                 continue;
3078                         skb_dst_set_noref(skb, &rt->u.dst);
3079                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3080                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3081                                          1, NLM_F_MULTI) <= 0) {
3082                                 skb_dst_drop(skb);
3083                                 rcu_read_unlock_bh();
3084                                 goto done;
3085                         }
3086                         skb_dst_drop(skb);
3087                 }
3088                 rcu_read_unlock_bh();
3089         }
3090
3091 done:
3092         cb->args[0] = h;
3093         cb->args[1] = idx;
3094         return skb->len;
3095 }
3096
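     /* A multicast configuration change invalidates the whole cache. */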
3097 void ip_rt_multicast_event(struct in_device *in_dev)
3098 {
3099         rt_cache_flush(dev_net(in_dev->dev), 0);
3100 }
3101
3102 #ifdef CONFIG_SYSCTL
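     /*
      * Handler for the write-only "flush" sysctl: the value written is read
      * with proc_dointvec() and handed to rt_cache_flush() as the flush
      * delay for this namespace.  Reads are rejected with -EINVAL.
      */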
3103 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3104                                         void __user *buffer,
3105                                         size_t *lenp, loff_t *ppos)
3106 {
3107         if (write) {
3108                 int flush_delay;
3109                 ctl_table ctl;
3110                 struct net *net;
3111
3112                 memcpy(&ctl, __ctl, sizeof(ctl));
3113                 ctl.data = &flush_delay;
3114                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3115
3116                 net = (struct net *)__ctl->extra1;
3117                 rt_cache_flush(net, flush_delay);
3118                 return 0;
3119         }
3120
3121         return -EINVAL;
3122 }
3123
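     /* Tunables exposed under /proc/sys/net/ipv4/route/. */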
3124 static ctl_table ipv4_route_table[] = {
3125         {
3126                 .procname       = "gc_thresh",
3127                 .data           = &ipv4_dst_ops.gc_thresh,
3128                 .maxlen         = sizeof(int),
3129                 .mode           = 0644,
3130                 .proc_handler   = proc_dointvec,
3131         },
3132         {
3133                 .procname       = "max_size",
3134                 .data           = &ip_rt_max_size,
3135                 .maxlen         = sizeof(int),
3136                 .mode           = 0644,
3137                 .proc_handler   = proc_dointvec_route,
3138         },
3139         {
3140                 /*  Deprecated. Use gc_min_interval_ms */
3141
3142                 .procname       = "gc_min_interval",
3143                 .data           = &ip_rt_gc_min_interval,
3144                 .maxlen         = sizeof(int),
3145                 .mode           = 0644,
3146                 .proc_handler   = proc_dointvec_jiffies,
3147         },
3148         {
3149                 .procname       = "gc_min_interval_ms",
3150                 .data           = &ip_rt_gc_min_interval,
3151                 .maxlen         = sizeof(int),
3152                 .mode           = 0644,
3153                 .proc_handler   = proc_dointvec_ms_jiffies,
3154         },
3155         {
3156                 .procname       = "gc_timeout",
3157                 .data           = &ip_rt_gc_timeout,
3158                 .maxlen         = sizeof(int),
3159                 .mode           = 0644,
3160                 .proc_handler   = proc_dointvec_jiffies,
3161         },
3162         {
3163                 .procname       = "gc_interval",
3164                 .data           = &ip_rt_gc_interval,
3165                 .maxlen         = sizeof(int),
3166                 .mode           = 0644,
3167                 .proc_handler   = proc_dointvec_jiffies,
3168         },
3169         {
3170                 .procname       = "redirect_load",
3171                 .data           = &ip_rt_redirect_load,
3172                 .maxlen         = sizeof(int),
3173                 .mode           = 0644,
3174                 .proc_handler   = proc_dointvec_route,
3175         },
3176         {
3177                 .procname       = "redirect_number",
3178                 .data           = &ip_rt_redirect_number,
3179                 .maxlen         = sizeof(int),
3180                 .mode           = 0644,
3181                 .proc_handler   = proc_dointvec,
3182         },
3183         {
3184                 .procname       = "redirect_silence",
3185                 .data           = &ip_rt_redirect_silence,
3186                 .maxlen         = sizeof(int),
3187                 .mode           = 0644,
3188                 .proc_handler   = proc_dointvec,
3189         },
3190         {
3191                 .procname       = "error_cost",
3192                 .data           = &ip_rt_error_cost,
3193                 .maxlen         = sizeof(int),
3194                 .mode           = 0644,
3195                 .proc_handler   = proc_dointvec,
3196         },
3197         {
3198                 .procname       = "error_burst",
3199                 .data           = &ip_rt_error_burst,
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0644,
3202                 .proc_handler   = proc_dointvec,
3203         },
3204         {
3205                 .procname       = "gc_elasticity",
3206                 .data           = &ip_rt_gc_elasticity,
3207                 .maxlen         = sizeof(int),
3208                 .mode           = 0644,
3209                 .proc_handler   = proc_dointvec,
3210         },
3211         {
3212                 .procname       = "mtu_expires",
3213                 .data           = &ip_rt_mtu_expires,
3214                 .maxlen         = sizeof(int),
3215                 .mode           = 0644,
3216                 .proc_handler   = proc_dointvec_jiffies,
3217         },
3218         {
3219                 .procname       = "min_pmtu",
3220                 .data           = &ip_rt_min_pmtu,
3221                 .maxlen         = sizeof(int),
3222                 .mode           = 0644,
3223                 .proc_handler   = proc_dointvec,
3224         },
3225         {
3226                 .procname       = "min_adv_mss",
3227                 .data           = &ip_rt_min_advmss,
3228                 .maxlen         = sizeof(int),
3229                 .mode           = 0644,
3230                 .proc_handler   = proc_dointvec,
3231         },
3232         { }
3233 };
3234
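     /*
      * Skeleton tables registered by ip_static_sysctl_init() (see below) to
      * create the net.ipv4.route and net.ipv4.neigh sysctl directories.
      */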
3235 static struct ctl_table empty[1];
3236
3237 static struct ctl_table ipv4_skeleton[] =
3238 {
3239         { .procname = "route", 
3240           .mode = 0555, .child = ipv4_route_table},
3241         { .procname = "neigh", 
3242           .mode = 0555, .child = empty},
3243         { }
3244 };
3245
3246 static __net_initdata struct ctl_path ipv4_path[] = {
3247         { .procname = "net", },
3248         { .procname = "ipv4", },
3249         { },
3250 };
3251
3252 static struct ctl_table ipv4_route_flush_table[] = {
3253         {
3254                 .procname       = "flush",
3255                 .maxlen         = sizeof(int),
3256                 .mode           = 0200,
3257                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3258         },
3259         { },
3260 };
3261
3262 static __net_initdata struct ctl_path ipv4_route_path[] = {
3263         { .procname = "net", },
3264         { .procname = "ipv4", },
3265         { .procname = "route", },
3266         { },
3267 };
3268
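     /*
      * Register the per-namespace "flush" sysctl.  Namespaces other than
      * init_net get their own copy of the table so that extra1 can point
      * at the owning struct net for the flush handler.
      */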
3269 static __net_init int sysctl_route_net_init(struct net *net)
3270 {
3271         struct ctl_table *tbl;
3272
3273         tbl = ipv4_route_flush_table;
3274         if (!net_eq(net, &init_net)) {
3275                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3276                 if (tbl == NULL)
3277                         goto err_dup;
3278         }
3279         tbl[0].extra1 = net;
3280
3281         net->ipv4.route_hdr =
3282                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3283         if (net->ipv4.route_hdr == NULL)
3284                 goto err_reg;
3285         return 0;
3286
3287 err_reg:
3288         if (tbl != ipv4_route_flush_table)
3289                 kfree(tbl);
3290 err_dup:
3291         return -ENOMEM;
3292 }
3293
3294 static __net_exit void sysctl_route_net_exit(struct net *net)
3295 {
3296         struct ctl_table *tbl;
3297
3298         tbl = net->ipv4.route_hdr->ctl_table_arg;
3299         unregister_net_sysctl_table(net->ipv4.route_hdr);
3300         BUG_ON(tbl == ipv4_route_flush_table);
3301         kfree(tbl);
3302 }
3303
3304 static __net_initdata struct pernet_operations sysctl_route_ops = {
3305         .init = sysctl_route_net_init,
3306         .exit = sysctl_route_net_exit,
3307 };
3308 #endif
3309
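     /*
      * Seed the per-namespace route cache generation id.  Cached entries
      * whose rt_genid no longer matches this value are treated as expired.
      */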
3310 static __net_init int rt_genid_init(struct net *net)
3311 {
3312         get_random_bytes(&net->ipv4.rt_genid,
3313                          sizeof(net->ipv4.rt_genid));
3314         return 0;
3315 }
3316
3317 static __net_initdata struct pernet_operations rt_genid_ops = {
3318         .init = rt_genid_init,
3319 };
3320
3321
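     /* Per-cpu accounting area used by the CONFIG_NET_CLS_ROUTE classifier. */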
3322 #ifdef CONFIG_NET_CLS_ROUTE
3323 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3324 #endif /* CONFIG_NET_CLS_ROUTE */
3325
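     /*
      * "rhash_entries=" boot parameter: fixes the number of route cache
      * hash buckets instead of letting it scale with available memory.
      */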
3326 static __initdata unsigned long rhash_entries;
3327 static int __init set_rhash_entries(char *str)
3328 {
3329         if (!str)
3330                 return 0;
3331         rhash_entries = simple_strtoul(str, &str, 0);
3332         return 1;
3333 }
3334 __setup("rhash_entries=", set_rhash_entries);
3335
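     /*
      * Subsystem initialisation: allocate the per-cpu accounting area and
      * the dst slab cache, size and clear the route hash table, hook up
      * the proc and sysctl entries, start the periodic expiry work and
      * register the RTM_GETROUTE handler.
      */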
3336 int __init ip_rt_init(void)
3337 {
3338         int rc = 0;
3339
3340 #ifdef CONFIG_NET_CLS_ROUTE
3341         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3342         if (!ip_rt_acct)
3343                 panic("IP: failed to allocate ip_rt_acct\n");
3344 #endif
3345
3346         ipv4_dst_ops.kmem_cachep =
3347                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3348                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3349
3350         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3351
3352         rt_hash_table = (struct rt_hash_bucket *)
3353                 alloc_large_system_hash("IP route cache",
3354                                         sizeof(struct rt_hash_bucket),
3355                                         rhash_entries,
3356                                         (totalram_pages >= 128 * 1024) ?
3357                                         15 : 17,
3358                                         0,
3359                                         &rt_hash_log,
3360                                         &rt_hash_mask,
3361                                         rhash_entries ? 0 : 512 * 1024);
3362         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3363         rt_hash_lock_init();
3364
3365         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3366         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3367
3368 #ifdef CONFIG_PROC_FS
3369         mutex_init(&ipv4_route_lock);
3370 #endif
3371
3372         mem_reserve_init(&ipv4_route_reserve, "IPv4 route cache",
3373                         &net_rx_reserve);
3374         mem_reserve_kmem_cache_set(&ipv4_route_reserve,
3375                         ipv4_dst_ops.kmem_cachep, ip_rt_max_size);
3376
3386         devinet_init();
3387         ip_fib_init();
3388
3389         /* All the timers started at system startup tend
3390            to synchronize. Perturb them a bit.
3391          */
3392         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3393         expires_ljiffies = jiffies;
3394         schedule_delayed_work(&expires_work,
3395                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3396
3397         if (ip_rt_proc_init())
3398                 printk(KERN_ERR "Unable to create route proc files\n");
3399 #ifdef CONFIG_XFRM
3400         xfrm_init();
3401         xfrm4_init(ip_rt_max_size);
3402 #endif
3403         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3404
3405 #ifdef CONFIG_SYSCTL
3406         register_pernet_subsys(&sysctl_route_ops);
3407 #endif
3408         register_pernet_subsys(&rt_genid_ops);
3409         return rc;
3410 }
3411
3412 #ifdef CONFIG_SYSCTL
3413 /*
3414  * We really need to sanitize the damn ipv4 init order, then all
3415  * this nonsense will go away.
3416  */
3417 void __init ip_static_sysctl_init(void)
3418 {
3419         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3420 }
3421 #endif
3422
3423 EXPORT_SYMBOL(__ip_select_ident);
3424 EXPORT_SYMBOL(ip_route_output_key);