ipv6: fix problem with expired dst cache
[linux-flexiantxendom0-3.2.10.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
65 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 {
126         struct in6_addr *p = &rt->rt6i_gateway;
127
128         if (!ipv6_addr_any(p))
129                 return (const void *) p;
130         return daddr;
131 }
132
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 {
135         struct rt6_info *rt = (struct rt6_info *) dst;
136         struct neighbour *n;
137
138         daddr = choose_neigh_daddr(rt, daddr);
139         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
140         if (n)
141                 return n;
142         return neigh_create(&nd_tbl, daddr, dst->dev);
143 }
144
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 {
147         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148         if (!n) {
149                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
150                 if (IS_ERR(n))
151                         return PTR_ERR(n);
152         }
153         dst_set_neighbour(&rt->dst, n);
154
155         return 0;
156 }
157
158 static struct dst_ops ip6_dst_ops_template = {
159         .family                 =       AF_INET6,
160         .protocol               =       cpu_to_be16(ETH_P_IPV6),
161         .gc                     =       ip6_dst_gc,
162         .gc_thresh              =       1024,
163         .check                  =       ip6_dst_check,
164         .default_advmss         =       ip6_default_advmss,
165         .mtu                    =       ip6_mtu,
166         .cow_metrics            =       ipv6_cow_metrics,
167         .destroy                =       ip6_dst_destroy,
168         .ifdown                 =       ip6_dst_ifdown,
169         .negative_advice        =       ip6_negative_advice,
170         .link_failure           =       ip6_link_failure,
171         .update_pmtu            =       ip6_rt_update_pmtu,
172         .local_out              =       __ip6_local_out,
173         .neigh_lookup           =       ip6_neigh_lookup,
174 };
175
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 {
178         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179
180         return mtu ? : dst->dev->mtu;
181 }
182
183 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
184 {
185 }
186
187 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
188                                          unsigned long old)
189 {
190         return NULL;
191 }
192
193 static struct dst_ops ip6_dst_blackhole_ops = {
194         .family                 =       AF_INET6,
195         .protocol               =       cpu_to_be16(ETH_P_IPV6),
196         .destroy                =       ip6_dst_destroy,
197         .check                  =       ip6_dst_check,
198         .mtu                    =       ip6_blackhole_mtu,
199         .default_advmss         =       ip6_default_advmss,
200         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
201         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
202         .neigh_lookup           =       ip6_neigh_lookup,
203 };
204
205 static const u32 ip6_template_metrics[RTAX_MAX] = {
206         [RTAX_HOPLIMIT - 1] = 255,
207 };
208
209 static struct rt6_info ip6_null_entry_template = {
210         .dst = {
211                 .__refcnt       = ATOMIC_INIT(1),
212                 .__use          = 1,
213                 .obsolete       = -1,
214                 .error          = -ENETUNREACH,
215                 .input          = ip6_pkt_discard,
216                 .output         = ip6_pkt_discard_out,
217         },
218         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
219         .rt6i_protocol  = RTPROT_KERNEL,
220         .rt6i_metric    = ~(u32) 0,
221         .rt6i_ref       = ATOMIC_INIT(1),
222 };
223
224 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225
226 static int ip6_pkt_prohibit(struct sk_buff *skb);
227 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
228
229 static struct rt6_info ip6_prohibit_entry_template = {
230         .dst = {
231                 .__refcnt       = ATOMIC_INIT(1),
232                 .__use          = 1,
233                 .obsolete       = -1,
234                 .error          = -EACCES,
235                 .input          = ip6_pkt_prohibit,
236                 .output         = ip6_pkt_prohibit_out,
237         },
238         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
239         .rt6i_protocol  = RTPROT_KERNEL,
240         .rt6i_metric    = ~(u32) 0,
241         .rt6i_ref       = ATOMIC_INIT(1),
242 };
243
244 static struct rt6_info ip6_blk_hole_entry_template = {
245         .dst = {
246                 .__refcnt       = ATOMIC_INIT(1),
247                 .__use          = 1,
248                 .obsolete       = -1,
249                 .error          = -EINVAL,
250                 .input          = dst_discard,
251                 .output         = dst_discard,
252         },
253         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
254         .rt6i_protocol  = RTPROT_KERNEL,
255         .rt6i_metric    = ~(u32) 0,
256         .rt6i_ref       = ATOMIC_INIT(1),
257 };
258
259 #endif
260
261 /* allocate dst with ip6_dst_ops */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263                                              struct net_device *dev,
264                                              int flags)
265 {
266         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
267
268         if (rt)
269                 memset(&rt->rt6i_table, 0,
270                        sizeof(*rt) - sizeof(struct dst_entry));
271
272         return rt;
273 }
274
275 static void ip6_dst_destroy(struct dst_entry *dst)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct inet_peer *peer = rt->rt6i_peer;
280
281         if (!(rt->dst.flags & DST_HOST))
282                 dst_destroy_metrics_generic(dst);
283
284         if (idev) {
285                 rt->rt6i_idev = NULL;
286                 in6_dev_put(idev);
287         }
288
289         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
290                 dst_release(dst->from);
291
292         if (peer) {
293                 rt->rt6i_peer = NULL;
294                 inet_putpeer(peer);
295         }
296 }
297
298 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
299
300 static u32 rt6_peer_genid(void)
301 {
302         return atomic_read(&__rt6_peer_genid);
303 }
304
305 void rt6_bind_peer(struct rt6_info *rt, int create)
306 {
307         struct inet_peer *peer;
308
309         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
310         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
311                 inet_putpeer(peer);
312         else
313                 rt->rt6i_peer_genid = rt6_peer_genid();
314 }
315
316 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
317                            int how)
318 {
319         struct rt6_info *rt = (struct rt6_info *)dst;
320         struct inet6_dev *idev = rt->rt6i_idev;
321         struct net_device *loopback_dev =
322                 dev_net(dev)->loopback_dev;
323
324         if (dev != loopback_dev && idev && idev->dev == dev) {
325                 struct inet6_dev *loopback_idev =
326                         in6_dev_get(loopback_dev);
327                 if (loopback_idev) {
328                         rt->rt6i_idev = loopback_idev;
329                         in6_dev_put(idev);
330                 }
331         }
332 }
333
334 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
335 {
336         struct rt6_info *ort = NULL;
337
338         if (rt->rt6i_flags & RTF_EXPIRES) {
339                 if (time_after(jiffies, rt->dst.expires))
340                         return 1;
341         } else if (rt->dst.from) {
342                 ort = (struct rt6_info *) rt->dst.from;
343                 return (ort->rt6i_flags & RTF_EXPIRES) &&
344                         time_after(jiffies, ort->dst.expires);
345         }
346         return 0;
347 }
348
349 static inline int rt6_need_strict(const struct in6_addr *daddr)
350 {
351         return ipv6_addr_type(daddr) &
352                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
353 }
354
355 /*
356  *      Route lookup. Any table->tb6_lock is implied.
357  */
358
359 static inline struct rt6_info *rt6_device_match(struct net *net,
360                                                     struct rt6_info *rt,
361                                                     const struct in6_addr *saddr,
362                                                     int oif,
363                                                     int flags)
364 {
365         struct rt6_info *local = NULL;
366         struct rt6_info *sprt;
367
368         if (!oif && ipv6_addr_any(saddr))
369                 goto out;
370
371         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
372                 struct net_device *dev = sprt->dst.dev;
373
374                 if (oif) {
375                         if (dev->ifindex == oif)
376                                 return sprt;
377                         if (dev->flags & IFF_LOOPBACK) {
378                                 if (!sprt->rt6i_idev ||
379                                     sprt->rt6i_idev->dev->ifindex != oif) {
380                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
381                                                 continue;
382                                         if (local && (!oif ||
383                                                       local->rt6i_idev->dev->ifindex == oif))
384                                                 continue;
385                                 }
386                                 local = sprt;
387                         }
388                 } else {
389                         if (ipv6_chk_addr(net, saddr, dev,
390                                           flags & RT6_LOOKUP_F_IFACE))
391                                 return sprt;
392                 }
393         }
394
395         if (oif) {
396                 if (local)
397                         return local;
398
399                 if (flags & RT6_LOOKUP_F_IFACE)
400                         return net->ipv6.ip6_null_entry;
401         }
402 out:
403         return rt;
404 }
405
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the neighbour bound to @rt is not in a NUD_VALID state and the last
 * update is older than the interface's rtr_probe_interval, send a
 * Neighbour Solicitation to the solicited-node multicast address of the
 * neighbour.  A NULL @rt or a route without neighbour is ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute.  The rate limit is implemented by stamping neigh->updated, so
 * the check and the stamp are done under the write side of neigh->lock:
 * with only the read side held, concurrent callers could all pass the
 * check and each send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;

        rcu_read_lock();
        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
        /* cheap unlocked pre-check; re-validated under the lock below */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;

        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                /* do not hold the neighbour lock while transmitting */
                write_unlock_bh(&neigh->lock);

                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
        } else {
                write_unlock_bh(&neigh->lock);
        }
out:
        rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
445
446 /*
447  * Default Router Selection (RFC 2461 6.3.6)
448  */
449 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
450 {
451         struct net_device *dev = rt->dst.dev;
452         if (!oif || dev->ifindex == oif)
453                 return 2;
454         if ((dev->flags & IFF_LOOPBACK) &&
455             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
456                 return 1;
457         return 0;
458 }
459
460 static inline int rt6_check_neigh(struct rt6_info *rt)
461 {
462         struct neighbour *neigh;
463         int m;
464
465         rcu_read_lock();
466         neigh = dst_get_neighbour_noref(&rt->dst);
467         if (rt->rt6i_flags & RTF_NONEXTHOP ||
468             !(rt->rt6i_flags & RTF_GATEWAY))
469                 m = 1;
470         else if (neigh) {
471                 read_lock_bh(&neigh->lock);
472                 if (neigh->nud_state & NUD_VALID)
473                         m = 2;
474 #ifdef CONFIG_IPV6_ROUTER_PREF
475                 else if (neigh->nud_state & NUD_FAILED)
476                         m = 0;
477 #endif
478                 else
479                         m = 1;
480                 read_unlock_bh(&neigh->lock);
481         } else
482                 m = 0;
483         rcu_read_unlock();
484         return m;
485 }
486
487 static int rt6_score_route(struct rt6_info *rt, int oif,
488                            int strict)
489 {
490         int m, n;
491
492         m = rt6_check_dev(rt, oif);
493         if (!m && (strict & RT6_LOOKUP_F_IFACE))
494                 return -1;
495 #ifdef CONFIG_IPV6_ROUTER_PREF
496         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
497 #endif
498         n = rt6_check_neigh(rt);
499         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
500                 return -1;
501         return m;
502 }
503
504 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
505                                    int *mpri, struct rt6_info *match)
506 {
507         int m;
508
509         if (rt6_check_expired(rt))
510                 goto out;
511
512         m = rt6_score_route(rt, oif, strict);
513         if (m < 0)
514                 goto out;
515
516         if (m > *mpri) {
517                 if (strict & RT6_LOOKUP_F_REACHABLE)
518                         rt6_probe(match);
519                 *mpri = m;
520                 match = rt;
521         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
522                 rt6_probe(rt);
523         }
524
525 out:
526         return match;
527 }
528
529 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
530                                      struct rt6_info *rr_head,
531                                      u32 metric, int oif, int strict)
532 {
533         struct rt6_info *rt, *match;
534         int mpri = -1;
535
536         match = NULL;
537         for (rt = rr_head; rt && rt->rt6i_metric == metric;
538              rt = rt->dst.rt6_next)
539                 match = find_match(rt, oif, strict, &mpri, match);
540         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
541              rt = rt->dst.rt6_next)
542                 match = find_match(rt, oif, strict, &mpri, match);
543
544         return match;
545 }
546
547 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
548 {
549         struct rt6_info *match, *rt0;
550         struct net *net;
551
552         rt0 = fn->rr_ptr;
553         if (!rt0)
554                 fn->rr_ptr = rt0 = fn->leaf;
555
556         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
557
558         if (!match &&
559             (strict & RT6_LOOKUP_F_REACHABLE)) {
560                 struct rt6_info *next = rt0->dst.rt6_next;
561
562                 /* no entries matched; do round-robin */
563                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
564                         next = fn->leaf;
565
566                 if (next != rt0)
567                         fn->rr_ptr = next;
568         }
569
570         net = dev_net(rt0->dst.dev);
571         return match ? match : net->ipv6.ip6_null_entry;
572 }
573
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option received on @dev.
 *
 * @opt/@len describe the option, @gwaddr is the gateway to use for the
 * route.  After validating the option, the matching route is looked up
 * and then deleted (lifetime 0), created (no route yet, lifetime != 0) or
 * has its preference refreshed.  The surviving route gets its expiry set
 * from the lifetime, or cleared when the lifetime is not finite.
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* a zero lifetime withdraws an existing route */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (rt) {
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        } else if (lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev->ifindex, pref);
        }

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime))
                rt6_set_expires(rt, jiffies + HZ * lifetime);
        else
                rt6_clean_expires(rt);

        dst_release(&rt->dst);
        return 0;
}
#endif
646
/*
 * BACKTRACK(net, saddr) - climb the fib6 tree after a failed selection.
 *
 * Relies on the expanding function providing:
 *   - local variables "rt" (struct rt6_info *) and "fn" (struct fib6_node *)
 *   - labels "restart" (redo the selection on the new fn) and "out"
 *
 * If rt is the namespace's null entry, fn is moved towards the root: at a
 * parent that has a subtree other than the node we came from, the subtree
 * is searched with @saddr; otherwise fn becomes the parent.  The climb
 * jumps to "restart" at the first node flagged RTN_RTINFO and to "out"
 * once a node flagged RTN_TL_ROOT is reached.  When rt is anything else
 * the macro does nothing.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == (__net)->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
664
665 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
666                                              struct fib6_table *table,
667                                              struct flowi6 *fl6, int flags)
668 {
669         struct fib6_node *fn;
670         struct rt6_info *rt;
671
672         read_lock_bh(&table->tb6_lock);
673         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
674 restart:
675         rt = fn->leaf;
676         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
677         BACKTRACK(net, &fl6->saddr);
678 out:
679         dst_use(&rt->dst, jiffies);
680         read_unlock_bh(&table->tb6_lock);
681         return rt;
682
683 }
684
685 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
686                                     int flags)
687 {
688         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
689 }
690 EXPORT_SYMBOL_GPL(ip6_route_lookup);
691
692 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
693                             const struct in6_addr *saddr, int oif, int strict)
694 {
695         struct flowi6 fl6 = {
696                 .flowi6_oif = oif,
697                 .daddr = *daddr,
698         };
699         struct dst_entry *dst;
700         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
701
702         if (saddr) {
703                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
704                 flags |= RT6_LOOKUP_F_HAS_SADDR;
705         }
706
707         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
708         if (dst->error == 0)
709                 return (struct rt6_info *) dst;
710
711         dst_release(dst);
712
713         return NULL;
714 }
715
716 EXPORT_SYMBOL(rt6_lookup);
717
718 /* ip6_ins_rt is called with FREE table->tb6_lock.
719    It takes new route entry, the addition fails by any reason the
720    route is freed. In any case, if caller does not hold it, it may
721    be destroyed.
722  */
723
724 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
725 {
726         int err;
727         struct fib6_table *table;
728
729         table = rt->rt6i_table;
730         write_lock_bh(&table->tb6_lock);
731         err = fib6_add(&table->tb6_root, rt, info);
732         write_unlock_bh(&table->tb6_lock);
733
734         return err;
735 }
736
737 int ip6_ins_rt(struct rt6_info *rt)
738 {
739         struct nl_info info = {
740                 .nl_net = dev_net(rt->dst.dev),
741         };
742         return __ip6_ins_rt(rt, &info);
743 }
744
745 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
746                                       const struct in6_addr *daddr,
747                                       const struct in6_addr *saddr)
748 {
749         struct rt6_info *rt;
750
751         /*
752          *      Clone the route.
753          */
754
755         rt = ip6_rt_copy(ort, daddr);
756
757         if (rt) {
758                 int attempts = !in_softirq();
759
760                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
761                         if (ort->rt6i_dst.plen != 128 &&
762                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
763                                 rt->rt6i_flags |= RTF_ANYCAST;
764                         rt->rt6i_gateway = *daddr;
765                 }
766
767                 rt->rt6i_flags |= RTF_CACHE;
768
769 #ifdef CONFIG_IPV6_SUBTREES
770                 if (rt->rt6i_src.plen && saddr) {
771                         rt->rt6i_src.addr = *saddr;
772                         rt->rt6i_src.plen = 128;
773                 }
774 #endif
775
776         retry:
777                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
778                         struct net *net = dev_net(rt->dst.dev);
779                         int saved_rt_min_interval =
780                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
781                         int saved_rt_elasticity =
782                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
783
784                         if (attempts-- > 0) {
785                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
786                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
787
788                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
789
790                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
791                                         saved_rt_elasticity;
792                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
793                                         saved_rt_min_interval;
794                                 goto retry;
795                         }
796
797                         if (net_ratelimit())
798                                 printk(KERN_WARNING
799                                        "ipv6: Neighbour table overflow.\n");
800                         dst_free(&rt->dst);
801                         return NULL;
802                 }
803         }
804
805         return rt;
806 }
807
808 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
809                                         const struct in6_addr *daddr)
810 {
811         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
812
813         if (rt) {
814                 rt->rt6i_flags |= RTF_CACHE;
815                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
816         }
817         return rt;
818 }
819
820 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
821                                       struct flowi6 *fl6, int flags)
822 {
823         struct fib6_node *fn;
824         struct rt6_info *rt, *nrt;
825         int strict = 0;
826         int attempts = 3;
827         int err;
828         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
829
830         strict |= flags & RT6_LOOKUP_F_IFACE;
831
832 relookup:
833         read_lock_bh(&table->tb6_lock);
834
835 restart_2:
836         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
837
838 restart:
839         rt = rt6_select(fn, oif, strict | reachable);
840
841         BACKTRACK(net, &fl6->saddr);
842         if (rt == net->ipv6.ip6_null_entry ||
843             rt->rt6i_flags & RTF_CACHE)
844                 goto out;
845
846         dst_hold(&rt->dst);
847         read_unlock_bh(&table->tb6_lock);
848
849         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
850                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
851         else if (!(rt->dst.flags & DST_HOST))
852                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
853         else
854                 goto out2;
855
856         dst_release(&rt->dst);
857         rt = nrt ? : net->ipv6.ip6_null_entry;
858
859         dst_hold(&rt->dst);
860         if (nrt) {
861                 err = ip6_ins_rt(nrt);
862                 if (!err)
863                         goto out2;
864         }
865
866         if (--attempts <= 0)
867                 goto out2;
868
869         /*
870          * Race condition! In the gap, when table->tb6_lock was
871          * released someone could insert this route.  Relookup.
872          */
873         dst_release(&rt->dst);
874         goto relookup;
875
876 out:
877         if (reachable) {
878                 reachable = 0;
879                 goto restart_2;
880         }
881         dst_hold(&rt->dst);
882         read_unlock_bh(&table->tb6_lock);
883 out2:
884         rt->dst.lastuse = jiffies;
885         rt->dst.__use++;
886
887         return rt;
888 }
889
890 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
891                                             struct flowi6 *fl6, int flags)
892 {
893         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
894 }
895
896 static struct dst_entry *ip6_route_input_lookup(struct net *net,
897                                                 struct net_device *dev,
898                                                 struct flowi6 *fl6, int flags)
899 {
900         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
901                 flags |= RT6_LOOKUP_F_IFACE;
902
903         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
904 }
905
906 void ip6_route_input(struct sk_buff *skb)
907 {
908         const struct ipv6hdr *iph = ipv6_hdr(skb);
909         struct net *net = dev_net(skb->dev);
910         int flags = RT6_LOOKUP_F_HAS_SADDR;
911         struct flowi6 fl6 = {
912                 .flowi6_iif = skb->dev->ifindex,
913                 .daddr = iph->daddr,
914                 .saddr = iph->saddr,
915                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
916                 .flowi6_mark = skb->mark,
917                 .flowi6_proto = iph->nexthdr,
918         };
919
920         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
921 }
922
923 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
924                                              struct flowi6 *fl6, int flags)
925 {
926         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
927 }
928
929 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
930                                     struct flowi6 *fl6)
931 {
932         int flags = 0;
933
934         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
935                 flags |= RT6_LOOKUP_F_IFACE;
936
937         if (!ipv6_addr_any(&fl6->saddr))
938                 flags |= RT6_LOOKUP_F_HAS_SADDR;
939         else if (sk)
940                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
941
942         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
943 }
944
945 EXPORT_SYMBOL(ip6_route_output);
946
947 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
948 {
949         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
950         struct dst_entry *new = NULL;
951
952         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
953         if (rt) {
954                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
955
956                 new = &rt->dst;
957
958                 new->__use = 1;
959                 new->input = dst_discard;
960                 new->output = dst_discard;
961
962                 if (dst_metrics_read_only(&ort->dst))
963                         new->_metrics = ort->dst._metrics;
964                 else
965                         dst_copy_metrics(new, &ort->dst);
966                 rt->rt6i_idev = ort->rt6i_idev;
967                 if (rt->rt6i_idev)
968                         in6_dev_hold(rt->rt6i_idev);
969
970                 rt->rt6i_gateway = ort->rt6i_gateway;
971                 rt->rt6i_flags = ort->rt6i_flags;
972                 rt6_clean_expires(rt);
973                 rt->rt6i_metric = 0;
974
975                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
976 #ifdef CONFIG_IPV6_SUBTREES
977                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
978 #endif
979
980                 dst_free(new);
981         }
982
983         dst_release(dst_orig);
984         return new ? new : ERR_PTR(-ENOMEM);
985 }
986
987 /*
988  *      Destination cache support functions
989  */
990
991 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
992 {
993         struct rt6_info *rt;
994
995         rt = (struct rt6_info *) dst;
996
997         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
998                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
999                         if (!rt->rt6i_peer)
1000                                 rt6_bind_peer(rt, 0);
1001                         rt->rt6i_peer_genid = rt6_peer_genid();
1002                 }
1003                 return dst;
1004         }
1005         return NULL;
1006 }
1007
1008 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1009 {
1010         struct rt6_info *rt = (struct rt6_info *) dst;
1011
1012         if (rt) {
1013                 if (rt->rt6i_flags & RTF_CACHE) {
1014                         if (rt6_check_expired(rt)) {
1015                                 ip6_del_rt(rt);
1016                                 dst = NULL;
1017                         }
1018                 } else {
1019                         dst_release(dst);
1020                         dst = NULL;
1021                 }
1022         }
1023         return dst;
1024 }
1025
1026 static void ip6_link_failure(struct sk_buff *skb)
1027 {
1028         struct rt6_info *rt;
1029
1030         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1031
1032         rt = (struct rt6_info *) skb_dst(skb);
1033         if (rt) {
1034                 if (rt->rt6i_flags & RTF_CACHE)
1035                         rt6_update_expires(rt, 0);
1036                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1037                         rt->rt6i_node->fn_sernum = -1;
1038         }
1039 }
1040
1041 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1042 {
1043         struct rt6_info *rt6 = (struct rt6_info*)dst;
1044
1045         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1046                 rt6->rt6i_flags |= RTF_MODIFIED;
1047                 if (mtu < IPV6_MIN_MTU) {
1048                         u32 features = dst_metric(dst, RTAX_FEATURES);
1049                         mtu = IPV6_MIN_MTU;
1050                         features |= RTAX_FEATURE_ALLFRAG;
1051                         dst_metric_set(dst, RTAX_FEATURES, features);
1052                 }
1053                 dst_metric_set(dst, RTAX_MTU, mtu);
1054         }
1055 }
1056
1057 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1058 {
1059         struct net_device *dev = dst->dev;
1060         unsigned int mtu = dst_mtu(dst);
1061         struct net *net = dev_net(dev);
1062
1063         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1064
1065         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1066                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1067
1068         /*
1069          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1070          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1071          * IPV6_MAXPLEN is also valid and means: "any MSS,
1072          * rely only on pmtu discovery"
1073          */
1074         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1075                 mtu = IPV6_MAXPLEN;
1076         return mtu;
1077 }
1078
1079 static unsigned int ip6_mtu(const struct dst_entry *dst)
1080 {
1081         struct inet6_dev *idev;
1082         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1083
1084         if (mtu)
1085                 return mtu;
1086
1087         mtu = IPV6_MIN_MTU;
1088
1089         rcu_read_lock();
1090         idev = __in6_dev_get(dst->dev);
1091         if (idev)
1092                 mtu = idev->cnf.mtu6;
1093         rcu_read_unlock();
1094
1095         return mtu;
1096 }
1097
/*
 * Singly linked list (chained through dst->next) of the dst entries created
 * by icmp6_dst_alloc().  icmp6_dst_lock is held (BH-disabling variant)
 * around every access to the list in this part of the file.
 */
1098 static struct dst_entry *icmp6_dst_gc_list;
1099 static DEFINE_SPINLOCK(icmp6_dst_lock);
1100
1101 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1102                                   struct neighbour *neigh,
1103                                   struct flowi6 *fl6)
1104 {
1105         struct dst_entry *dst;
1106         struct rt6_info *rt;
1107         struct inet6_dev *idev = in6_dev_get(dev);
1108         struct net *net = dev_net(dev);
1109
1110         if (unlikely(!idev))
1111                 return ERR_PTR(-ENODEV);
1112
1113         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1114         if (unlikely(!rt)) {
1115                 in6_dev_put(idev);
1116                 dst = ERR_PTR(-ENOMEM);
1117                 goto out;
1118         }
1119
1120         if (neigh)
1121                 neigh_hold(neigh);
1122         else {
1123                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1124                 if (IS_ERR(neigh)) {
1125                         in6_dev_put(idev);
1126                         dst_free(&rt->dst);
1127                         return ERR_CAST(neigh);
1128                 }
1129         }
1130
1131         rt->dst.flags |= DST_HOST;
1132         rt->dst.output  = ip6_output;
1133         dst_set_neighbour(&rt->dst, neigh);
1134         atomic_set(&rt->dst.__refcnt, 1);
1135         rt->rt6i_dst.addr = fl6->daddr;
1136         rt->rt6i_dst.plen = 128;
1137         rt->rt6i_idev     = idev;
1138         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1139
1140         spin_lock_bh(&icmp6_dst_lock);
1141         rt->dst.next = icmp6_dst_gc_list;
1142         icmp6_dst_gc_list = &rt->dst;
1143         spin_unlock_bh(&icmp6_dst_lock);
1144
1145         fib6_force_start_gc(net);
1146
1147         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1148
1149 out:
1150         return dst;
1151 }
1152
1153 int icmp6_dst_gc(void)
1154 {
1155         struct dst_entry *dst, **pprev;
1156         int more = 0;
1157
1158         spin_lock_bh(&icmp6_dst_lock);
1159         pprev = &icmp6_dst_gc_list;
1160
1161         while ((dst = *pprev) != NULL) {
1162                 if (!atomic_read(&dst->__refcnt)) {
1163                         *pprev = dst->next;
1164                         dst_free(dst);
1165                 } else {
1166                         pprev = &dst->next;
1167                         ++more;
1168                 }
1169         }
1170
1171         spin_unlock_bh(&icmp6_dst_lock);
1172
1173         return more;
1174 }
1175
1176 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1177                             void *arg)
1178 {
1179         struct dst_entry *dst, **pprev;
1180
1181         spin_lock_bh(&icmp6_dst_lock);
1182         pprev = &icmp6_dst_gc_list;
1183         while ((dst = *pprev) != NULL) {
1184                 struct rt6_info *rt = (struct rt6_info *) dst;
1185                 if (func(rt, arg)) {
1186                         *pprev = dst->next;
1187                         dst_free(dst);
1188                 } else {
1189                         pprev = &dst->next;
1190                 }
1191         }
1192         spin_unlock_bh(&icmp6_dst_lock);
1193 }
1194
1195 static int ip6_dst_gc(struct dst_ops *ops)
1196 {
1197         unsigned long now = jiffies;
1198         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1199         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1200         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1201         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1202         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1203         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1204         int entries;
1205
1206         entries = dst_entries_get_fast(ops);
1207         if (time_after(rt_last_gc + rt_min_interval, now) &&
1208             entries <= rt_max_size)
1209                 goto out;
1210
1211         net->ipv6.ip6_rt_gc_expire++;
1212         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1213         net->ipv6.ip6_rt_last_gc = now;
1214         entries = dst_entries_get_slow(ops);
1215         if (entries < ops->gc_thresh)
1216                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1217 out:
1218         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1219         return entries > rt_max_size;
1220 }
1221
1222 /* Clean host part of a prefix. Not necessary in radix tree,
1223    but results in cleaner routing tables.
1224
1225    Remove it only when all the things will work!
1226  */
1227
1228 int ip6_dst_hoplimit(struct dst_entry *dst)
1229 {
1230         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1231         if (hoplimit == 0) {
1232                 struct net_device *dev = dst->dev;
1233                 struct inet6_dev *idev;
1234
1235                 rcu_read_lock();
1236                 idev = __in6_dev_get(dev);
1237                 if (idev)
1238                         hoplimit = idev->cnf.hop_limit;
1239                 else
1240                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1241                 rcu_read_unlock();
1242         }
1243         return hoplimit;
1244 }
1245 EXPORT_SYMBOL(ip6_dst_hoplimit);
1246
1247 /*
1248  *
1249  */
1250
/*
 * Create a route from the configuration in @cfg and insert it into the
 * selected fib6 table.
 *
 * Steps, in order: validate prefix lengths, resolve the device named by
 * fc_ifindex (if any), default the metric to IP6_RT_PRIO_USER, find or
 * create the table, allocate the rt6_info, set expiry/protocol/handlers and
 * the destination (and, with CONFIG_IPV6_SUBTREES, source) prefix, handle
 * reject routes, validate the gateway, validate the preferred source
 * address, bind a neighbour where needed, apply the netlink metrics and
 * finally hand the route to __ip6_ins_rt().
 *
 * Returns the result of __ip6_ins_rt() on the insertion path, or a negative
 * errno if set-up fails; in the failure case the device/idev references
 * taken here are dropped and the partially built route is freed (label out).
 */
1251 int ip6_route_add(struct fib6_config *cfg)
1252 {
1253         int err;
1254         struct net *net = cfg->fc_nlinfo.nl_net;
1255         struct rt6_info *rt = NULL;
1256         struct net_device *dev = NULL;
1257         struct inet6_dev *idev = NULL;
1258         struct fib6_table *table;
1259         int addr_type;
1260
1261         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1262                 return -EINVAL;
1263 #ifndef CONFIG_IPV6_SUBTREES
1264         if (cfg->fc_src_len)
1265                 return -EINVAL;
1266 #endif
/* An explicit interface index pins both the device and its inet6 data. */
1267         if (cfg->fc_ifindex) {
1268                 err = -ENODEV;
1269                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1270                 if (!dev)
1271                         goto out;
1272                 idev = in6_dev_get(dev);
1273                 if (!idev)
1274                         goto out;
1275         }
1276
1277         if (cfg->fc_metric == 0)
1278                 cfg->fc_metric = IP6_RT_PRIO_USER;
1279
/*
 * err is preset so that a failed table lookup/creation reports -ENOBUFS.
 * A netlink request without NLM_F_CREATE first looks for an existing table
 * but still creates one (after a warning) when none is found.
 */
1280         err = -ENOBUFS;
1281         if (cfg->fc_nlinfo.nlh &&
1282             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1283                 table = fib6_get_table(net, cfg->fc_table);
1284                 if (!table) {
1285                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1286                         table = fib6_new_table(net, cfg->fc_table);
1287                 }
1288         } else {
1289                 table = fib6_new_table(net, cfg->fc_table);
1290         }
1291
1292         if (!table)
1293                 goto out;
1294
1295         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1296
1297         if (!rt) {
1298                 err = -ENOMEM;
1299                 goto out;
1300         }
1301
1302         rt->dst.obsolete = -1;
1303
1304         if (cfg->fc_flags & RTF_EXPIRES)
1305                 rt6_set_expires(rt, jiffies +
1306                                 clock_t_to_jiffies(cfg->fc_expires));
1307         else
1308                 rt6_clean_expires(rt);
1309
1310         if (cfg->fc_protocol == RTPROT_UNSPEC)
1311                 cfg->fc_protocol = RTPROT_BOOT;
1312         rt->rt6i_protocol = cfg->fc_protocol;
1313
1314         addr_type = ipv6_addr_type(&cfg->fc_dst);
1315
/* Pick the input handler: multicast, local delivery or forwarding. */
1316         if (addr_type & IPV6_ADDR_MULTICAST)
1317                 rt->dst.input = ip6_mc_input;
1318         else if (cfg->fc_flags & RTF_LOCAL)
1319                 rt->dst.input = ip6_input;
1320         else
1321                 rt->dst.input = ip6_forward;
1322
1323         rt->dst.output = ip6_output;
1324
1325         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1326         rt->rt6i_dst.plen = cfg->fc_dst_len;
1327         if (rt->rt6i_dst.plen == 128)
1328                rt->dst.flags |= DST_HOST;
1329
/* Non-host routes that carry metrics get a private, zeroed metrics array. */
1330         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1331                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1332                 if (!metrics) {
1333                         err = -ENOMEM;
1334                         goto out;
1335                 }
1336                 dst_init_metrics(&rt->dst, metrics, 0);
1337         }
1338 #ifdef CONFIG_IPV6_SUBTREES
1339         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1340         rt->rt6i_src.plen = cfg->fc_src_len;
1341 #endif
1342
1343         rt->rt6i_metric = cfg->fc_metric;
1344
1345         /* We cannot add true routes via loopback here,
1346            they would result in kernel looping; promote them to reject routes
1347          */
1348         if ((cfg->fc_flags & RTF_REJECT) ||
1349             (dev && (dev->flags & IFF_LOOPBACK) &&
1350              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1351              !(cfg->fc_flags & RTF_LOCAL))) {
1352                 /* hold loopback dev/idev if we haven't done so. */
1353                 if (dev != net->loopback_dev) {
1354                         if (dev) {
1355                                 dev_put(dev);
1356                                 in6_dev_put(idev);
1357                         }
1358                         dev = net->loopback_dev;
1359                         dev_hold(dev);
1360                         idev = in6_dev_get(dev);
1361                         if (!idev) {
1362                                 err = -ENODEV;
1363                                 goto out;
1364                         }
1365                 }
1366                 rt->dst.output = ip6_pkt_discard_out;
1367                 rt->dst.input = ip6_pkt_discard;
1368                 rt->dst.error = -ENETUNREACH;
1369                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1370                 goto install_route;
1371         }
1372
1373         if (cfg->fc_flags & RTF_GATEWAY) {
1374                 const struct in6_addr *gw_addr;
1375                 int gwa_type;
1376
1377                 gw_addr = &cfg->fc_gateway;
1378                 rt->rt6i_gateway = *gw_addr;
1379                 gwa_type = ipv6_addr_type(gw_addr);
1380
1381                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1382                         struct rt6_info *grt;
1383
1384                         /* IPv6 strictly inhibits using not link-local
1385                            addresses as nexthop address.
1386                            Otherwise, router will not able to send redirects.
1387                            It is very good, but in some (rare!) circumstances
1388                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1389                            some exceptions. --ANK
1390                          */
1391                         err = -EINVAL;
1392                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1393                                 goto out;
1394
1395                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1396
/*
 * err stays -EHOSTUNREACH unless the route towards the gateway exists,
 * matches the requested device (if any) and is itself not a gateway route.
 */
1397                         err = -EHOSTUNREACH;
1398                         if (!grt)
1399                                 goto out;
1400                         if (dev) {
1401                                 if (dev != grt->dst.dev) {
1402                                         dst_release(&grt->dst);
1403                                         goto out;
1404                                 }
1405                         } else {
1406                                 dev = grt->dst.dev;
1407                                 idev = grt->rt6i_idev;
1408                                 dev_hold(dev);
1409                                 in6_dev_hold(grt->rt6i_idev);
1410                         }
1411                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1412                                 err = 0;
1413                         dst_release(&grt->dst);
1414
1415                         if (err)
1416                                 goto out;
1417                 }
1418                 err = -EINVAL;
1419                 if (!dev || (dev->flags & IFF_LOOPBACK))
1420                         goto out;
1421         }
1422
1423         err = -ENODEV;
1424         if (!dev)
1425                 goto out;
1426
/* A preferred source address must be an address on the chosen device. */
1427         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1428                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1429                         err = -EINVAL;
1430                         goto out;
1431                 }
1432                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1433                 rt->rt6i_prefsrc.plen = 128;
1434         } else
1435                 rt->rt6i_prefsrc.plen = 0;
1436
1437         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1438                 err = rt6_bind_neighbour(rt, dev);
1439                 if (err)
1440                         goto out;
1441         }
1442
1443         rt->rt6i_flags = cfg->fc_flags;
1444
1445 install_route:
/* Copy the metrics supplied as nested netlink attributes; type 0 is skipped. */
1446         if (cfg->fc_mx) {
1447                 struct nlattr *nla;
1448                 int remaining;
1449
1450                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1451                         int type = nla_type(nla);
1452
1453                         if (type) {
1454                                 if (type > RTAX_MAX) {
1455                                         err = -EINVAL;
1456                                         goto out;
1457                                 }
1458
1459                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1460                         }
1461                 }
1462         }
1463
/*
 * The dev/idev references taken above are stored in the route here.
 * NOTE(review): presumably __ip6_ins_rt() disposes of rt (and with it these
 * references) if insertion fails -- confirm, since this path bypasses out:.
 */
1464         rt->dst.dev = dev;
1465         rt->rt6i_idev = idev;
1466         rt->rt6i_table = table;
1467
1468         cfg->fc_nlinfo.nl_net = dev_net(dev);
1469
1470         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1471
1472 out:
1473         if (dev)
1474                 dev_put(dev);
1475         if (idev)
1476                 in6_dev_put(idev);
1477         if (rt)
1478                 dst_free(&rt->dst);
1479         return err;
1480 }
1481
1482 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1483 {
1484         int err;
1485         struct fib6_table *table;
1486         struct net *net = dev_net(rt->dst.dev);
1487
1488         if (rt == net->ipv6.ip6_null_entry)
1489                 return -ENOENT;
1490
1491         table = rt->rt6i_table;
1492         write_lock_bh(&table->tb6_lock);
1493
1494         err = fib6_del(rt, info);
1495         dst_release(&rt->dst);
1496
1497         write_unlock_bh(&table->tb6_lock);
1498
1499         return err;
1500 }
1501
1502 int ip6_del_rt(struct rt6_info *rt)
1503 {
1504         struct nl_info info = {
1505                 .nl_net = dev_net(rt->dst.dev),
1506         };
1507         return __ip6_del_rt(rt, &info);
1508 }
1509
1510 static int ip6_route_del(struct fib6_config *cfg)
1511 {
1512         struct fib6_table *table;
1513         struct fib6_node *fn;
1514         struct rt6_info *rt;
1515         int err = -ESRCH;
1516
1517         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1518         if (!table)
1519                 return err;
1520
1521         read_lock_bh(&table->tb6_lock);
1522
1523         fn = fib6_locate(&table->tb6_root,
1524                          &cfg->fc_dst, cfg->fc_dst_len,
1525                          &cfg->fc_src, cfg->fc_src_len);
1526
1527         if (fn) {
1528                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1529                         if (cfg->fc_ifindex &&
1530                             (!rt->dst.dev ||
1531                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1532                                 continue;
1533                         if (cfg->fc_flags & RTF_GATEWAY &&
1534                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1535                                 continue;
1536                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1537                                 continue;
1538                         dst_hold(&rt->dst);
1539                         read_unlock_bh(&table->tb6_lock);
1540
1541                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1542                 }
1543         }
1544         read_unlock_bh(&table->tb6_lock);
1545
1546         return err;
1547 }
1548
1549 /*
1550  *      Handle redirects
1551  */
/*
 * Flow key extended with the address of the router that sent a redirect.
 * fl6 is the first member: __ip6_route_redirect() receives a pointer to it
 * and casts that pointer back to struct ip6rd_flowi to reach gateway.
 */
1552 struct ip6rd_flowi {
1553         struct flowi6 fl6;
1554         struct in6_addr gateway;
1555 };
1556
1557 static struct rt6_info *__ip6_route_redirect(struct net *net,
1558                                              struct fib6_table *table,
1559                                              struct flowi6 *fl6,
1560                                              int flags)
1561 {
1562         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1563         struct rt6_info *rt;
1564         struct fib6_node *fn;
1565
1566         /*
1567          * Get the "current" route for this destination and
1568          * check if the redirect has come from approriate router.
1569          *
1570          * RFC 2461 specifies that redirects should only be
1571          * accepted if they come from the nexthop to the target.
1572          * Due to the way the routes are chosen, this notion
1573          * is a bit fuzzy and one might need to check all possible
1574          * routes.
1575          */
1576
1577         read_lock_bh(&table->tb6_lock);
1578         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1579 restart:
1580         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1581                 /*
1582                  * Current route is on-link; redirect is always invalid.
1583                  *
1584                  * Seems, previous statement is not true. It could
1585                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1586                  * But then router serving it might decide, that we should
1587                  * know truth 8)8) --ANK (980726).
1588                  */
1589                 if (rt6_check_expired(rt))
1590                         continue;
1591                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1592                         continue;
1593                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1594                         continue;
1595                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1596                         continue;
1597                 break;
1598         }
1599
1600         if (!rt)
1601                 rt = net->ipv6.ip6_null_entry;
1602         BACKTRACK(net, &fl6->saddr);
1603 out:
1604         dst_hold(&rt->dst);
1605
1606         read_unlock_bh(&table->tb6_lock);
1607
1608         return rt;
1609 };
1610
1611 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1612                                            const struct in6_addr *src,
1613                                            const struct in6_addr *gateway,
1614                                            struct net_device *dev)
1615 {
1616         int flags = RT6_LOOKUP_F_HAS_SADDR;
1617         struct net *net = dev_net(dev);
1618         struct ip6rd_flowi rdfl = {
1619                 .fl6 = {
1620                         .flowi6_oif = dev->ifindex,
1621                         .daddr = *dest,
1622                         .saddr = *src,
1623                 },
1624         };
1625
1626         rdfl.gateway = *gateway;
1627
1628         if (rt6_need_strict(dest))
1629                 flags |= RT6_LOOKUP_F_IFACE;
1630
1631         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1632                                                    flags, __ip6_route_redirect);
1633 }
1634
/*
 * Apply a received redirect for destination @dest.
 *
 * @src and @saddr are passed to ip6_route_redirect() as the flow source and
 * as the address of the redirecting router respectively; @neigh is the
 * neighbour entry of the new next hop, @lladdr its link-layer address and
 * @on_link is non-zero when the target is the destination itself.
 *
 * If a current route via the redirecting router exists, the neighbour entry
 * is updated, a cloned cache route through @neigh is inserted and a
 * NETEVENT_REDIRECT notification is sent.  When the old route was itself a
 * cache entry it is deleted.
 *
 * Reference handling: the reference on rt obtained from the lookup is
 * dropped at out:, except on the RTF_CACHE path where rt is handed to
 * ip6_del_rt() and the function returns without calling dst_release().
 */
1635 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1636                   const struct in6_addr *saddr,
1637                   struct neighbour *neigh, u8 *lladdr, int on_link)
1638 {
1639         struct rt6_info *rt, *nrt = NULL;
1640         struct netevent_redirect netevent;
1641         struct net *net = dev_net(neigh->dev);
1642
1643         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1644
1645         if (rt == net->ipv6.ip6_null_entry) {
1646                 if (net_ratelimit())
1647                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1648                                "for redirect target\n");
1649                 goto out;
1650         }
1651
1652         /*
1653          *      We have finally decided to accept it.
1654          */
1655
1656         neigh_update(neigh, lladdr, NUD_STALE,
1657                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1658                      NEIGH_UPDATE_F_OVERRIDE|
1659                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1660                                      NEIGH_UPDATE_F_ISROUTER))
1661                      );
1662
1663         /*
1664          * Redirect received -> path was valid.
1665          * Look, redirects are sent only in response to data packets,
1666          * so that this nexthop apparently is reachable. --ANK
1667          */
1668         dst_confirm(&rt->dst);
1669
1670         /* Duplicate redirect: silently ignore. */
1671         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1672                 goto out;
1673
1674         nrt = ip6_rt_copy(rt, dest);
1675         if (!nrt)
1676                 goto out;
1677
/* The copy is a dynamic cache route; it is a gateway route unless on-link. */
1678         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1679         if (on_link)
1680                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1681
1682         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1683         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1684
1685         if (ip6_ins_rt(nrt))
1686                 goto out;
1687
1688         netevent.old = &rt->dst;
1689         netevent.new = &nrt->dst;
1690         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1691
/* An old cache entry is superseded by nrt: delete it instead of releasing. */
1692         if (rt->rt6i_flags & RTF_CACHE) {
1693                 ip6_del_rt(rt);
1694                 return;
1695         }
1696
1697 out:
1698         dst_release(&rt->dst);
1699 }
1700
1701 /*
1702  *      Handle ICMP "packet too big" messages
1703  *      i.e. Path MTU discovery
1704  */
1705
/*
 * Record a reduced path MTU @pmtu for the flow @saddr -> @daddr in namespace
 * @net, looking the route up with interface index @ifindex.
 *
 * An expired route found by the lookup is deleted and the lookup is retried.
 * Nothing is changed when @pmtu is not below the route's current MTU.  A
 * @pmtu below IPV6_MIN_MTU is raised to IPV6_MIN_MTU and the ALLFRAG feature
 * is set.  A cache route is updated in place; for any other route a cache
 * copy (COW or clone) carrying the new MTU is created and inserted.  In both
 * cases the entry's expiry is set from the ip6_rt_mtu_expires sysctl.
 */
1706 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1707                              struct net *net, u32 pmtu, int ifindex)
1708 {
1709         struct rt6_info *rt, *nrt;
1710         int allfrag = 0;
1711 again:
1712         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1713         if (!rt)
1714                 return;
1715
/* ip6_del_rt() takes over the lookup's reference, so no dst_release() here. */
1716         if (rt6_check_expired(rt)) {
1717                 ip6_del_rt(rt);
1718                 goto again;
1719         }
1720
1721         if (pmtu >= dst_mtu(&rt->dst))
1722                 goto out;
1723
1724         if (pmtu < IPV6_MIN_MTU) {
1725                 /*
1726                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1727                  * MTU (1280) and a fragment header should always be included
1728                  * after a node receiving Too Big message reporting PMTU is
1729                  * less than the IPv6 Minimum Link MTU.
1730                  */
1731                 pmtu = IPV6_MIN_MTU;
1732                 allfrag = 1;
1733         }
1734
1735         /* New mtu received -> path was valid.
1736            They are sent only in response to data packets,
1737            so that this nexthop apparently is reachable. --ANK
1738          */
1739         dst_confirm(&rt->dst);
1740
1741         /* Host route. If it is static, it would be better
1742            not to override it, but add new one, so that
1743            when cache entry will expire old pmtu
1744            would return automatically.
1745          */
1746         if (rt->rt6i_flags & RTF_CACHE) {
1747                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1748                 if (allfrag) {
1749                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1750                         features |= RTAX_FEATURE_ALLFRAG;
1751                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1752                 }
1753                 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1754                 rt->rt6i_flags |= RTF_MODIFIED;
1755                 goto out;
1756         }
1757
1758         /* Network route.
1759            Two cases are possible:
1760            1. It is connected route. Action: COW
1761            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1762          */
1763         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1764                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1765         else
1766                 nrt = rt6_alloc_clone(rt, daddr);
1767
1768         if (nrt) {
1769                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1770                 if (allfrag) {
1771                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1772                         features |= RTAX_FEATURE_ALLFRAG;
1773                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1774                 }
1775
1776                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1777                  * happened within 5 mins, the recommended timer is 10 mins.
1778                  * Here this route expiration time is set to ip6_rt_mtu_expires
1779                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1780                  * and detecting PMTU increase will be automatically happened.
1781                  */
1782                 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1783                 nrt->rt6i_flags |= RTF_DYNAMIC;
1784                 ip6_ins_rt(nrt);
1785         }
1786 out:
1787         dst_release(&rt->dst);
1788 }
1789
1790 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1791                         struct net_device *dev, u32 pmtu)
1792 {
1793         struct net *net = dev_net(dev);
1794
1795         /*
1796          * RFC 1981 states that a node "MUST reduce the size of the packets it
1797          * is sending along the path" that caused the Packet Too Big message.
1798          * Since it's not possible in the general case to determine which
1799          * interface was used to send the original packet, we update the MTU
1800          * on the interface that will be used to send future packets. We also
1801          * update the MTU on the interface that received the Packet Too Big in
1802          * case the original packet was forced out that interface with
1803          * SO_BINDTODEVICE or similar. This is the next best thing to the
1804          * correct behaviour, which would be to update the MTU on all
1805          * interfaces.
1806          */
1807         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1808         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1809 }
1810
1811 /*
1812  *      Misc support functions
1813  */
1814
1815 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1816                                     const struct in6_addr *dest)
1817 {
1818         struct net *net = dev_net(ort->dst.dev);
1819         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1820                                             ort->dst.dev, 0);
1821
1822         if (rt) {
1823                 rt->dst.input = ort->dst.input;
1824                 rt->dst.output = ort->dst.output;
1825                 rt->dst.flags |= DST_HOST;
1826
1827                 rt->rt6i_dst.addr = *dest;
1828                 rt->rt6i_dst.plen = 128;
1829                 dst_copy_metrics(&rt->dst, &ort->dst);
1830                 rt->dst.error = ort->dst.error;
1831                 rt->rt6i_idev = ort->rt6i_idev;
1832                 if (rt->rt6i_idev)
1833                         in6_dev_hold(rt->rt6i_idev);
1834                 rt->dst.lastuse = jiffies;
1835
1836                 rt->rt6i_gateway = ort->rt6i_gateway;
1837                 rt->rt6i_flags = ort->rt6i_flags;
1838                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1839                     (RTF_DEFAULT | RTF_ADDRCONF))
1840                         rt6_set_from(rt, ort);
1841                 else
1842                         rt6_clean_expires(rt);
1843                 rt->rt6i_metric = 0;
1844
1845 #ifdef CONFIG_IPV6_SUBTREES
1846                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1847 #endif
1848                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1849                 rt->rt6i_table = ort->rt6i_table;
1850         }
1851         return rt;
1852 }
1853
1854 #ifdef CONFIG_IPV6_ROUTE_INFO
1855 static struct rt6_info *rt6_get_route_info(struct net *net,
1856                                            const struct in6_addr *prefix, int prefixlen,
1857                                            const struct in6_addr *gwaddr, int ifindex)
1858 {
1859         struct fib6_node *fn;
1860         struct rt6_info *rt = NULL;
1861         struct fib6_table *table;
1862
1863         table = fib6_get_table(net, RT6_TABLE_INFO);
1864         if (!table)
1865                 return NULL;
1866
1867         write_lock_bh(&table->tb6_lock);
1868         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1869         if (!fn)
1870                 goto out;
1871
1872         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1873                 if (rt->dst.dev->ifindex != ifindex)
1874                         continue;
1875                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1876                         continue;
1877                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1878                         continue;
1879                 dst_hold(&rt->dst);
1880                 break;
1881         }
1882 out:
1883         write_unlock_bh(&table->tb6_lock);
1884         return rt;
1885 }
1886
1887 static struct rt6_info *rt6_add_route_info(struct net *net,
1888                                            const struct in6_addr *prefix, int prefixlen,
1889                                            const struct in6_addr *gwaddr, int ifindex,
1890                                            unsigned pref)
1891 {
1892         struct fib6_config cfg = {
1893                 .fc_table       = RT6_TABLE_INFO,
1894                 .fc_metric      = IP6_RT_PRIO_USER,
1895                 .fc_ifindex     = ifindex,
1896                 .fc_dst_len     = prefixlen,
1897                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1898                                   RTF_UP | RTF_PREF(pref),
1899                 .fc_nlinfo.pid = 0,
1900                 .fc_nlinfo.nlh = NULL,
1901                 .fc_nlinfo.nl_net = net,
1902         };
1903
1904         cfg.fc_dst = *prefix;
1905         cfg.fc_gateway = *gwaddr;
1906
1907         /* We should treat it as a default route if prefix length is 0. */
1908         if (!prefixlen)
1909                 cfg.fc_flags |= RTF_DEFAULT;
1910
1911         ip6_route_add(&cfg);
1912
1913         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1914 }
1915 #endif
1916
1917 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1918 {
1919         struct rt6_info *rt;
1920         struct fib6_table *table;
1921
1922         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1923         if (!table)
1924                 return NULL;
1925
1926         write_lock_bh(&table->tb6_lock);
1927         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1928                 if (dev == rt->dst.dev &&
1929                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1930                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1931                         break;
1932         }
1933         if (rt)
1934                 dst_hold(&rt->dst);
1935         write_unlock_bh(&table->tb6_lock);
1936         return rt;
1937 }
1938
1939 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1940                                      struct net_device *dev,
1941                                      unsigned int pref)
1942 {
1943         struct fib6_config cfg = {
1944                 .fc_table       = RT6_TABLE_DFLT,
1945                 .fc_metric      = IP6_RT_PRIO_USER,
1946                 .fc_ifindex     = dev->ifindex,
1947                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1948                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1949                 .fc_nlinfo.pid = 0,
1950                 .fc_nlinfo.nlh = NULL,
1951                 .fc_nlinfo.nl_net = dev_net(dev),
1952         };
1953
1954         cfg.fc_gateway = *gwaddr;
1955
1956         ip6_route_add(&cfg);
1957
1958         return rt6_get_dflt_router(gwaddr, dev);
1959 }
1960
1961 void rt6_purge_dflt_routers(struct net *net)
1962 {
1963         struct rt6_info *rt;
1964         struct fib6_table *table;
1965
1966         /* NOTE: Keep consistent with rt6_get_dflt_router */
1967         table = fib6_get_table(net, RT6_TABLE_DFLT);
1968         if (!table)
1969                 return;
1970
1971 restart:
1972         read_lock_bh(&table->tb6_lock);
1973         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1974                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1975                         dst_hold(&rt->dst);
1976                         read_unlock_bh(&table->tb6_lock);
1977                         ip6_del_rt(rt);
1978                         goto restart;
1979                 }
1980         }
1981         read_unlock_bh(&table->tb6_lock);
1982 }
1983
1984 static void rtmsg_to_fib6_config(struct net *net,
1985                                  struct in6_rtmsg *rtmsg,
1986                                  struct fib6_config *cfg)
1987 {
1988         memset(cfg, 0, sizeof(*cfg));
1989
1990         cfg->fc_table = RT6_TABLE_MAIN;
1991         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1992         cfg->fc_metric = rtmsg->rtmsg_metric;
1993         cfg->fc_expires = rtmsg->rtmsg_info;
1994         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1995         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1996         cfg->fc_flags = rtmsg->rtmsg_flags;
1997
1998         cfg->fc_nlinfo.nl_net = net;
1999
2000         cfg->fc_dst = rtmsg->rtmsg_dst;
2001         cfg->fc_src = rtmsg->rtmsg_src;
2002         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2003 }
2004
2005 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2006 {
2007         struct fib6_config cfg;
2008         struct in6_rtmsg rtmsg;
2009         int err;
2010
2011         switch(cmd) {
2012         case SIOCADDRT:         /* Add a route */
2013         case SIOCDELRT:         /* Delete a route */
2014                 if (!capable(CAP_NET_ADMIN))
2015                         return -EPERM;
2016                 err = copy_from_user(&rtmsg, arg,
2017                                      sizeof(struct in6_rtmsg));
2018                 if (err)
2019                         return -EFAULT;
2020
2021                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2022
2023                 rtnl_lock();
2024                 switch (cmd) {
2025                 case SIOCADDRT:
2026                         err = ip6_route_add(&cfg);
2027                         break;
2028                 case SIOCDELRT:
2029                         err = ip6_route_del(&cfg);
2030                         break;
2031                 default:
2032                         err = -EINVAL;
2033                 }
2034                 rtnl_unlock();
2035
2036                 return err;
2037         }
2038
2039         return -EINVAL;
2040 }
2041
2042 /*
2043  *      Drop the packet on the floor
2044  */
2045
2046 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2047 {
2048         int type;
2049         struct dst_entry *dst = skb_dst(skb);
2050         switch (ipstats_mib_noroutes) {
2051         case IPSTATS_MIB_INNOROUTES:
2052                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2053                 if (type == IPV6_ADDR_ANY) {
2054                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2055                                       IPSTATS_MIB_INADDRERRORS);
2056                         break;
2057                 }
2058                 /* FALLTHROUGH */
2059         case IPSTATS_MIB_OUTNOROUTES:
2060                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2061                               ipstats_mib_noroutes);
2062                 break;
2063         }
2064         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2065         kfree_skb(skb);
2066         return 0;
2067 }
2068
2069 static int ip6_pkt_discard(struct sk_buff *skb)
2070 {
2071         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2072 }
2073
2074 static int ip6_pkt_discard_out(struct sk_buff *skb)
2075 {
2076         skb->dev = skb_dst(skb)->dev;
2077         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2078 }
2079
2080 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2081
2082 static int ip6_pkt_prohibit(struct sk_buff *skb)
2083 {
2084         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2085 }
2086
2087 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2088 {
2089         skb->dev = skb_dst(skb)->dev;
2090         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2091 }
2092
2093 #endif
2094
2095 /*
2096  *      Allocate a dst for local (unicast / anycast) address.
2097  */
2098
2099 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2100                                     const struct in6_addr *addr,
2101                                     bool anycast)
2102 {
2103         struct net *net = dev_net(idev->dev);
2104         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2105                                             net->loopback_dev, 0);
2106         int err;
2107
2108         if (!rt) {
2109                 if (net_ratelimit())
2110                         pr_warning("IPv6:  Maximum number of routes reached,"
2111                                    " consider increasing route/max_size.\n");
2112                 return ERR_PTR(-ENOMEM);
2113         }
2114
2115         in6_dev_hold(idev);
2116
2117         rt->dst.flags |= DST_HOST;
2118         rt->dst.input = ip6_input;
2119         rt->dst.output = ip6_output;
2120         rt->rt6i_idev = idev;
2121         rt->dst.obsolete = -1;
2122
2123         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2124         if (anycast)
2125                 rt->rt6i_flags |= RTF_ANYCAST;
2126         else
2127                 rt->rt6i_flags |= RTF_LOCAL;
2128         err = rt6_bind_neighbour(rt, rt->dst.dev);
2129         if (err) {
2130                 dst_free(&rt->dst);
2131                 return ERR_PTR(err);
2132         }
2133
2134         rt->rt6i_dst.addr = *addr;
2135         rt->rt6i_dst.plen = 128;
2136         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2137
2138         atomic_set(&rt->dst.__refcnt, 1);
2139
2140         return rt;
2141 }
2142
2143 int ip6_route_get_saddr(struct net *net,
2144                         struct rt6_info *rt,
2145                         const struct in6_addr *daddr,
2146                         unsigned int prefs,
2147                         struct in6_addr *saddr)
2148 {
2149         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2150         int err = 0;
2151         if (rt->rt6i_prefsrc.plen)
2152                 *saddr = rt->rt6i_prefsrc.addr;
2153         else
2154                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2155                                          daddr, prefs, saddr);
2156         return err;
2157 }
2158
/* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose null entry is skipped */
	struct in6_addr *addr;	/* address compared against each prefsrc */
};
2165
2166 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2167 {
2168         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2169         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2170         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2171
2172         if (((void *)rt->dst.dev == dev || !dev) &&
2173             rt != net->ipv6.ip6_null_entry &&
2174             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2175                 /* remove prefsrc entry */
2176                 rt->rt6i_prefsrc.plen = 0;
2177         }
2178         return 0;
2179 }
2180
2181 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2182 {
2183         struct net *net = dev_net(ifp->idev->dev);
2184         struct arg_dev_net_ip adni = {
2185                 .dev = ifp->idev->dev,
2186                 .net = net,
2187                 .addr = &ifp->addr,
2188         };
2189         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2190 }
2191
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose null entry is skipped */
};
2196
2197 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2198 {
2199         const struct arg_dev_net *adn = arg;
2200         const struct net_device *dev = adn->dev;
2201
2202         if ((rt->dst.dev == dev || !dev) &&
2203             rt != adn->net->ipv6.ip6_null_entry)
2204                 return -1;
2205
2206         return 0;
2207 }
2208
2209 void rt6_ifdown(struct net *net, struct net_device *dev)
2210 {
2211         struct arg_dev_net adn = {
2212                 .dev = dev,
2213                 .net = net,
2214         };
2215
2216         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2217         icmp6_clean_all(fib6_ifdown, &adn);
2218 }
2219
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new device MTU */
};
2225
2226 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2227 {
2228         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2229         struct inet6_dev *idev;
2230
2231         /* In IPv6 pmtu discovery is not optional,
2232            so that RTAX_MTU lock cannot disable it.
2233            We still use this lock to block changes
2234            caused by addrconf/ndisc.
2235         */
2236
2237         idev = __in6_dev_get(arg->dev);
2238         if (!idev)
2239                 return 0;
2240
2241         /* For administrative MTU increase, there is no way to discover
2242            IPv6 PMTU increase, so PMTU increase should be updated here.
2243            Since RFC 1981 doesn't include administrative MTU increase
2244            update PMTU increase is a MUST. (i.e. jumbo frame)
2245          */
2246         /*
2247            If new MTU is less than route PMTU, this new MTU will be the
2248            lowest MTU in the path, update the route PMTU to reflect PMTU
2249            decreases; if new MTU is greater than route PMTU, and the
2250            old MTU is the lowest MTU in the path, update the route PMTU
2251            to reflect the increase. In this case if the other nodes' MTU
2252            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2253            PMTU discouvery.
2254          */
2255         if (rt->dst.dev == arg->dev &&
2256             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2257             (dst_mtu(&rt->dst) >= arg->mtu ||
2258              (dst_mtu(&rt->dst) < arg->mtu &&
2259               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2260                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2261         }
2262         return 0;
2263 }
2264
2265 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2266 {
2267         struct rt6_mtu_change_arg arg = {
2268                 .dev = dev,
2269                 .mtu = mtu,
2270         };
2271
2272         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2273 }
2274
2275 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2276         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2277         [RTA_OIF]               = { .type = NLA_U32 },
2278         [RTA_IIF]               = { .type = NLA_U32 },
2279         [RTA_PRIORITY]          = { .type = NLA_U32 },
2280         [RTA_METRICS]           = { .type = NLA_NESTED },
2281 };
2282
2283 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2284                               struct fib6_config *cfg)
2285 {
2286         struct rtmsg *rtm;
2287         struct nlattr *tb[RTA_MAX+1];
2288         int err;
2289
2290         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2291         if (err < 0)
2292                 goto errout;
2293
2294         err = -EINVAL;
2295         rtm = nlmsg_data(nlh);
2296         memset(cfg, 0, sizeof(*cfg));
2297
2298         cfg->fc_table = rtm->rtm_table;
2299         cfg->fc_dst_len = rtm->rtm_dst_len;
2300         cfg->fc_src_len = rtm->rtm_src_len;
2301         cfg->fc_flags = RTF_UP;
2302         cfg->fc_protocol = rtm->rtm_protocol;
2303
2304         if (rtm->rtm_type == RTN_UNREACHABLE)
2305                 cfg->fc_flags |= RTF_REJECT;
2306
2307         if (rtm->rtm_type == RTN_LOCAL)
2308                 cfg->fc_flags |= RTF_LOCAL;
2309
2310         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2311         cfg->fc_nlinfo.nlh = nlh;
2312         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2313
2314         if (tb[RTA_GATEWAY]) {
2315                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2316                 cfg->fc_flags |= RTF_GATEWAY;
2317         }
2318
2319         if (tb[RTA_DST]) {
2320                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2321
2322                 if (nla_len(tb[RTA_DST]) < plen)
2323                         goto errout;
2324
2325                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2326         }
2327
2328         if (tb[RTA_SRC]) {
2329                 int plen = (rtm->rtm_src_len + 7) >> 3;
2330
2331                 if (nla_len(tb[RTA_SRC]) < plen)
2332                         goto errout;
2333
2334                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2335         }
2336
2337         if (tb[RTA_PREFSRC])
2338                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2339
2340         if (tb[RTA_OIF])
2341                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2342
2343         if (tb[RTA_PRIORITY])
2344                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2345
2346         if (tb[RTA_METRICS]) {
2347                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2348                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2349         }
2350
2351         if (tb[RTA_TABLE])
2352                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2353
2354         err = 0;
2355 errout:
2356         return err;
2357 }
2358
2359 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2360 {
2361         struct fib6_config cfg;
2362         int err;
2363
2364         err = rtm_to_fib6_config(skb, nlh, &cfg);
2365         if (err < 0)
2366                 return err;
2367
2368         return ip6_route_del(&cfg);
2369 }
2370
2371 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2372 {
2373         struct fib6_config cfg;
2374         int err;
2375
2376         err = rtm_to_fib6_config(skb, nlh, &cfg);
2377         if (err < 0)
2378                 return err;
2379
2380         return ip6_route_add(&cfg);
2381 }
2382
2383 static inline size_t rt6_nlmsg_size(void)
2384 {
2385         return NLMSG_ALIGN(sizeof(struct rtmsg))
2386                + nla_total_size(16) /* RTA_SRC */
2387                + nla_total_size(16) /* RTA_DST */
2388                + nla_total_size(16) /* RTA_GATEWAY */
2389                + nla_total_size(16) /* RTA_PREFSRC */
2390                + nla_total_size(4) /* RTA_TABLE */
2391                + nla_total_size(4) /* RTA_IIF */
2392                + nla_total_size(4) /* RTA_OIF */
2393                + nla_total_size(4) /* RTA_PRIORITY */
2394                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2395                + nla_total_size(sizeof(struct rta_cacheinfo));
2396 }
2397
2398 static int rt6_fill_node(struct net *net,
2399                          struct sk_buff *skb, struct rt6_info *rt,
2400                          struct in6_addr *dst, struct in6_addr *src,
2401                          int iif, int type, u32 pid, u32 seq,
2402                          int prefix, int nowait, unsigned int flags)
2403 {
2404         const struct inet_peer *peer;
2405         struct rtmsg *rtm;
2406         struct nlmsghdr *nlh;
2407         long expires;
2408         u32 table;
2409         struct neighbour *n;
2410         u32 ts, tsage;
2411
2412         if (prefix) {   /* user wants prefix routes only */
2413                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2414                         /* success since this is not a prefix route */
2415                         return 1;
2416                 }
2417         }
2418
2419         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2420         if (!nlh)
2421                 return -EMSGSIZE;
2422
2423         rtm = nlmsg_data(nlh);
2424         rtm->rtm_family = AF_INET6;
2425         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2426         rtm->rtm_src_len = rt->rt6i_src.plen;
2427         rtm->rtm_tos = 0;
2428         if (rt->rt6i_table)
2429                 table = rt->rt6i_table->tb6_id;
2430         else
2431                 table = RT6_TABLE_UNSPEC;
2432         rtm->rtm_table = table;
2433         NLA_PUT_U32(skb, RTA_TABLE, table);
2434         if (rt->rt6i_flags & RTF_REJECT)
2435                 rtm->rtm_type = RTN_UNREACHABLE;
2436         else if (rt->rt6i_flags & RTF_LOCAL)
2437                 rtm->rtm_type = RTN_LOCAL;
2438         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2439                 rtm->rtm_type = RTN_LOCAL;
2440         else
2441                 rtm->rtm_type = RTN_UNICAST;
2442         rtm->rtm_flags = 0;
2443         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2444         rtm->rtm_protocol = rt->rt6i_protocol;
2445         if (rt->rt6i_flags & RTF_DYNAMIC)
2446                 rtm->rtm_protocol = RTPROT_REDIRECT;
2447         else if (rt->rt6i_flags & RTF_ADDRCONF)
2448                 rtm->rtm_protocol = RTPROT_KERNEL;
2449         else if (rt->rt6i_flags & RTF_DEFAULT)
2450                 rtm->rtm_protocol = RTPROT_RA;
2451
2452         if (rt->rt6i_flags & RTF_CACHE)
2453                 rtm->rtm_flags |= RTM_F_CLONED;
2454
2455         if (dst) {
2456                 NLA_PUT(skb, RTA_DST, 16, dst);
2457                 rtm->rtm_dst_len = 128;
2458         } else if (rtm->rtm_dst_len)
2459                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2460 #ifdef CONFIG_IPV6_SUBTREES
2461         if (src) {
2462                 NLA_PUT(skb, RTA_SRC, 16, src);
2463                 rtm->rtm_src_len = 128;
2464         } else if (rtm->rtm_src_len)
2465                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2466 #endif
2467         if (iif) {
2468 #ifdef CONFIG_IPV6_MROUTE
2469                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2470                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2471                         if (err <= 0) {
2472                                 if (!nowait) {
2473                                         if (err == 0)
2474                                                 return 0;
2475                                         goto nla_put_failure;
2476                                 } else {
2477                                         if (err == -EMSGSIZE)
2478                                                 goto nla_put_failure;
2479                                 }
2480                         }
2481                 } else
2482 #endif
2483                         NLA_PUT_U32(skb, RTA_IIF, iif);
2484         } else if (dst) {
2485                 struct in6_addr saddr_buf;
2486                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2487                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2488         }
2489
2490         if (rt->rt6i_prefsrc.plen) {
2491                 struct in6_addr saddr_buf;
2492                 saddr_buf = rt->rt6i_prefsrc.addr;
2493                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2494         }
2495
2496         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2497                 goto nla_put_failure;
2498
2499         rcu_read_lock();
2500         n = dst_get_neighbour_noref(&rt->dst);
2501         if (n) {
2502                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2503                         rcu_read_unlock();
2504                         goto nla_put_failure;
2505                 }
2506         }
2507         rcu_read_unlock();
2508
2509         if (rt->dst.dev)
2510                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2511
2512         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2513
2514         if (!(rt->rt6i_flags & RTF_EXPIRES))
2515                 expires = 0;
2516         else if (rt->dst.expires - jiffies < INT_MAX)
2517                 expires = rt->dst.expires - jiffies;
2518         else
2519                 expires = INT_MAX;
2520
2521         peer = rt->rt6i_peer;
2522         ts = tsage = 0;
2523         if (peer && peer->tcp_ts_stamp) {
2524                 ts = peer->tcp_ts;
2525                 tsage = get_seconds() - peer->tcp_ts_stamp;
2526         }
2527
2528         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2529                                expires, rt->dst.error) < 0)
2530                 goto nla_put_failure;
2531
2532         return nlmsg_end(skb, nlh);
2533
2534 nla_put_failure:
2535         nlmsg_cancel(skb, nlh);
2536         return -EMSGSIZE;
2537 }
2538
2539 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2540 {
2541         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2542         int prefix;
2543
2544         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2545                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2546                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2547         } else
2548                 prefix = 0;
2549
2550         return rt6_fill_node(arg->net,
2551                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2552                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2553                      prefix, 0, NLM_F_MULTI);
2554 }
2555
2556 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2557 {
2558         struct net *net = sock_net(in_skb->sk);
2559         struct nlattr *tb[RTA_MAX+1];
2560         struct rt6_info *rt;
2561         struct sk_buff *skb;
2562         struct rtmsg *rtm;
2563         struct flowi6 fl6;
2564         int err, iif = 0, oif = 0;
2565
2566         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2567         if (err < 0)
2568                 goto errout;
2569
2570         err = -EINVAL;
2571         memset(&fl6, 0, sizeof(fl6));
2572
2573         if (tb[RTA_SRC]) {
2574                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2575                         goto errout;
2576
2577                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2578         }
2579
2580         if (tb[RTA_DST]) {
2581                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2582                         goto errout;
2583
2584                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2585         }
2586
2587         if (tb[RTA_IIF])
2588                 iif = nla_get_u32(tb[RTA_IIF]);
2589
2590         if (tb[RTA_OIF])
2591                 oif = nla_get_u32(tb[RTA_OIF]);
2592
2593         if (iif) {
2594                 struct net_device *dev;
2595                 int flags = 0;
2596
2597                 dev = __dev_get_by_index(net, iif);
2598                 if (!dev) {
2599                         err = -ENODEV;
2600                         goto errout;
2601                 }
2602
2603                 fl6.flowi6_iif = iif;
2604
2605                 if (!ipv6_addr_any(&fl6.saddr))
2606                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2607
2608                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2609                                                                flags);
2610         } else {
2611                 fl6.flowi6_oif = oif;
2612
2613                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2614         }
2615
2616         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2617         if (!skb) {
2618                 err = -ENOBUFS;
2619                 goto errout;
2620         }
2621
2622         /* Reserve room for dummy headers, this skb can pass
2623            through good chunk of routing engine.
2624          */
2625         skb_reset_mac_header(skb);
2626         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2627
2628         skb_dst_set(skb, &rt->dst);
2629
2630         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2631                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2632                             nlh->nlmsg_seq, 0, 0, 0);
2633         if (err < 0) {
2634                 kfree_skb(skb);
2635                 goto errout;
2636         }
2637
2638         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2639 errout:
2640         return err;
2641 }
2642
2643 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2644 {
2645         struct sk_buff *skb;
2646         struct net *net = info->nl_net;
2647         u32 seq;
2648         int err;
2649
2650         err = -ENOBUFS;
2651         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2652
2653         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2654         if (!skb)
2655                 goto errout;
2656
2657         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2658                                 event, info->pid, seq, 0, 0, 0);
2659         if (err < 0) {
2660                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2661                 WARN_ON(err == -EMSGSIZE);
2662                 kfree_skb(skb);
2663                 goto errout;
2664         }
2665         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2666                     info->nlh, gfp_any());
2667         return;
2668 errout:
2669         if (err < 0)
2670                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2671 }
2672
2673 static int ip6_route_dev_notify(struct notifier_block *this,
2674                                 unsigned long event, void *data)
2675 {
2676         struct net_device *dev = (struct net_device *)data;
2677         struct net *net = dev_net(dev);
2678
2679         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2680                 net->ipv6.ip6_null_entry->dst.dev = dev;
2681                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2682 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2683                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2684                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2685                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2686                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2687 #endif
2688         }
2689
2690         return NOTIFY_OK;
2691 }
2692
2693 /*
2694  *      /proc
2695  */
2696
2697 #ifdef CONFIG_PROC_FS
2698
/*
 * Bookkeeping for a buffer-based route dump.
 * NOTE(review): no user of this struct is visible in this part of the
 * file; the field meanings are presumably output buffer plus offset and
 * length accounting -- confirm against callers before relying on them.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2707
2708 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2709 {
2710         struct seq_file *m = p_arg;
2711         struct neighbour *n;
2712
2713         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2714
2715 #ifdef CONFIG_IPV6_SUBTREES
2716         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2717 #else
2718         seq_puts(m, "00000000000000000000000000000000 00 ");
2719 #endif
2720         rcu_read_lock();
2721         n = dst_get_neighbour_noref(&rt->dst);
2722         if (n) {
2723                 seq_printf(m, "%pi6", n->primary_key);
2724         } else {
2725                 seq_puts(m, "00000000000000000000000000000000");
2726         }
2727         rcu_read_unlock();
2728         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2729                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2730                    rt->dst.__use, rt->rt6i_flags,
2731                    rt->dst.dev ? rt->dst.dev->name : "");
2732         return 0;
2733 }
2734
2735 static int ipv6_route_show(struct seq_file *m, void *v)
2736 {
2737         struct net *net = (struct net *)m->private;
2738         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2739         return 0;
2740 }
2741
2742 static int ipv6_route_open(struct inode *inode, struct file *file)
2743 {
2744         return single_open_net(inode, file, ipv6_route_show);
2745 }
2746
2747 static const struct file_operations ipv6_route_proc_fops = {
2748         .owner          = THIS_MODULE,
2749         .open           = ipv6_route_open,
2750         .read           = seq_read,
2751         .llseek         = seq_lseek,
2752         .release        = single_release_net,
2753 };
2754
2755 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2756 {
2757         struct net *net = (struct net *)seq->private;
2758         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2759                    net->ipv6.rt6_stats->fib_nodes,
2760                    net->ipv6.rt6_stats->fib_route_nodes,
2761                    net->ipv6.rt6_stats->fib_rt_alloc,
2762                    net->ipv6.rt6_stats->fib_rt_entries,
2763                    net->ipv6.rt6_stats->fib_rt_cache,
2764                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2765                    net->ipv6.rt6_stats->fib_discarded_routes);
2766
2767         return 0;
2768 }
2769
2770 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2771 {
2772         return single_open_net(inode, file, rt6_stats_seq_show);
2773 }
2774
2775 static const struct file_operations rt6_stats_seq_fops = {
2776         .owner   = THIS_MODULE,
2777         .open    = rt6_stats_seq_open,
2778         .read    = seq_read,
2779         .llseek  = seq_lseek,
2780         .release = single_release_net,
2781 };
2782 #endif  /* CONFIG_PROC_FS */
2783
2784 #ifdef CONFIG_SYSCTL
2785
2786 static
2787 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2788                               void __user *buffer, size_t *lenp, loff_t *ppos)
2789 {
2790         struct net *net;
2791         int delay;
2792         if (!write)
2793                 return -EINVAL;
2794
2795         net = (struct net *)ctl->extra1;
2796         delay = net->ipv6.sysctl.flush_delay;
2797         proc_dointvec(ctl, write, buffer, lenp, ppos);
2798         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2799         return 0;
2800 }
2801
2802 ctl_table ipv6_route_table_template[] = {
2803         {
2804                 .procname       =       "flush",
2805                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2806                 .maxlen         =       sizeof(int),
2807                 .mode           =       0200,
2808                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2809         },
2810         {
2811                 .procname       =       "gc_thresh",
2812                 .data           =       &ip6_dst_ops_template.gc_thresh,
2813                 .maxlen         =       sizeof(int),
2814                 .mode           =       0644,
2815                 .proc_handler   =       proc_dointvec,
2816         },
2817         {
2818                 .procname       =       "max_size",
2819                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2820                 .maxlen         =       sizeof(int),
2821                 .mode           =       0644,
2822                 .proc_handler   =       proc_dointvec,
2823         },
2824         {
2825                 .procname       =       "gc_min_interval",
2826                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2827                 .maxlen         =       sizeof(int),
2828                 .mode           =       0644,
2829                 .proc_handler   =       proc_dointvec_jiffies,
2830         },
2831         {
2832                 .procname       =       "gc_timeout",
2833                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2834                 .maxlen         =       sizeof(int),
2835                 .mode           =       0644,
2836                 .proc_handler   =       proc_dointvec_jiffies,
2837         },
2838         {
2839                 .procname       =       "gc_interval",
2840                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2841                 .maxlen         =       sizeof(int),
2842                 .mode           =       0644,
2843                 .proc_handler   =       proc_dointvec_jiffies,
2844         },
2845         {
2846                 .procname       =       "gc_elasticity",
2847                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2848                 .maxlen         =       sizeof(int),
2849                 .mode           =       0644,
2850                 .proc_handler   =       proc_dointvec,
2851         },
2852         {
2853                 .procname       =       "mtu_expires",
2854                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2855                 .maxlen         =       sizeof(int),
2856                 .mode           =       0644,
2857                 .proc_handler   =       proc_dointvec_jiffies,
2858         },
2859         {
2860                 .procname       =       "min_adv_mss",
2861                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2862                 .maxlen         =       sizeof(int),
2863                 .mode           =       0644,
2864                 .proc_handler   =       proc_dointvec,
2865         },
2866         {
2867                 .procname       =       "gc_min_interval_ms",
2868                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2869                 .maxlen         =       sizeof(int),
2870                 .mode           =       0644,
2871                 .proc_handler   =       proc_dointvec_ms_jiffies,
2872         },
2873         { }
2874 };
2875
2876 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2877 {
2878         struct ctl_table *table;
2879
2880         table = kmemdup(ipv6_route_table_template,
2881                         sizeof(ipv6_route_table_template),
2882                         GFP_KERNEL);
2883
2884         if (table) {
2885                 table[0].data = &net->ipv6.sysctl.flush_delay;
2886                 table[0].extra1 = net;
2887                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2888                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2889                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2890                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2891                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2892                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2893                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2894                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2895                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2896         }
2897
2898         return table;
2899 }
2900 #endif
2901
2902 static int __net_init ip6_route_net_init(struct net *net)
2903 {
2904         int ret = -ENOMEM;
2905
2906         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2907                sizeof(net->ipv6.ip6_dst_ops));
2908
2909         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2910                 goto out_ip6_dst_ops;
2911
2912         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2913                                            sizeof(*net->ipv6.ip6_null_entry),
2914                                            GFP_KERNEL);
2915         if (!net->ipv6.ip6_null_entry)
2916                 goto out_ip6_dst_entries;
2917         net->ipv6.ip6_null_entry->dst.path =
2918                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2919         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2920         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2921                          ip6_template_metrics, true);
2922
2923 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2924         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2925                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2926                                                GFP_KERNEL);
2927         if (!net->ipv6.ip6_prohibit_entry)
2928                 goto out_ip6_null_entry;
2929         net->ipv6.ip6_prohibit_entry->dst.path =
2930                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2931         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2932         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2933                          ip6_template_metrics, true);
2934
2935         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2936                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2937                                                GFP_KERNEL);
2938         if (!net->ipv6.ip6_blk_hole_entry)
2939                 goto out_ip6_prohibit_entry;
2940         net->ipv6.ip6_blk_hole_entry->dst.path =
2941                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2942         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2943         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2944                          ip6_template_metrics, true);
2945 #endif
2946
2947         net->ipv6.sysctl.flush_delay = 0;
2948         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2949         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2950         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2951         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2952         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2953         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2954         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2955
2956 #ifdef CONFIG_PROC_FS
2957         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2958         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2959 #endif
2960         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2961
2962         ret = 0;
2963 out:
2964         return ret;
2965
2966 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2967 out_ip6_prohibit_entry:
2968         kfree(net->ipv6.ip6_prohibit_entry);
2969 out_ip6_null_entry:
2970         kfree(net->ipv6.ip6_null_entry);
2971 #endif
2972 out_ip6_dst_entries:
2973         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2974 out_ip6_dst_ops:
2975         goto out;
2976 }
2977
2978 static void __net_exit ip6_route_net_exit(struct net *net)
2979 {
2980 #ifdef CONFIG_PROC_FS
2981         proc_net_remove(net, "ipv6_route");
2982         proc_net_remove(net, "rt6_stats");
2983 #endif
2984         kfree(net->ipv6.ip6_null_entry);
2985 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2986         kfree(net->ipv6.ip6_prohibit_entry);
2987         kfree(net->ipv6.ip6_blk_hole_entry);
2988 #endif
2989         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2990 }
2991
2992 static struct pernet_operations ip6_route_net_ops = {
2993         .init = ip6_route_net_init,
2994         .exit = ip6_route_net_exit,
2995 };
2996
2997 static struct notifier_block ip6_route_dev_notifier = {
2998         .notifier_call = ip6_route_dev_notify,
2999         .priority = 0,
3000 };
3001
3002 int __init ip6_route_init(void)
3003 {
3004         int ret;
3005
3006         ret = -ENOMEM;
3007         ip6_dst_ops_template.kmem_cachep =
3008                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3009                                   SLAB_HWCACHE_ALIGN, NULL);
3010         if (!ip6_dst_ops_template.kmem_cachep)
3011                 goto out;
3012
3013         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3014         if (ret)
3015                 goto out_kmem_cache;
3016
3017         ret = register_pernet_subsys(&ip6_route_net_ops);
3018         if (ret)
3019                 goto out_dst_entries;
3020
3021         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3022
3023         /* Registering of the loopback is done before this portion of code,
3024          * the loopback reference in rt6_info will not be taken, do it
3025          * manually for init_net */
3026         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3027         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3028   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3029         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3030         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3031         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3032         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3033   #endif
3034         ret = fib6_init();
3035         if (ret)
3036                 goto out_register_subsys;
3037
3038         ret = xfrm6_init();
3039         if (ret)
3040                 goto out_fib6_init;
3041
3042         ret = fib6_rules_init();
3043         if (ret)
3044                 goto xfrm6_init;
3045
3046         ret = -ENOBUFS;
3047         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3048             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3049             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3050                 goto fib6_rules_init;
3051
3052         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3053         if (ret)
3054                 goto fib6_rules_init;
3055
3056 out:
3057         return ret;
3058
3059 fib6_rules_init:
3060         fib6_rules_cleanup();
3061 xfrm6_init:
3062         xfrm6_fini();
3063 out_fib6_init:
3064         fib6_gc_cleanup();
3065 out_register_subsys:
3066         unregister_pernet_subsys(&ip6_route_net_ops);
3067 out_dst_entries:
3068         dst_entries_destroy(&ip6_dst_blackhole_ops);
3069 out_kmem_cache:
3070         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3071         goto out;
3072 }
3073
3074 void ip6_route_cleanup(void)
3075 {
3076         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3077         fib6_rules_cleanup();
3078         xfrm6_fini();
3079         fib6_gc_cleanup();
3080         unregister_pernet_subsys(&ip6_route_net_ops);
3081         dst_entries_destroy(&ip6_dst_blackhole_ops);
3082         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3083 }