Import changeset
[linux-flexiantxendom0-3.2.10.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.49 2000/11/03 01:11:58 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 #include <linux/config.h>
17 #include <linux/errno.h>
18 #include <linux/types.h>
19 #include <linux/socket.h>
20 #include <linux/sockios.h>
21 #include <linux/net.h>
22 #include <linux/route.h>
23 #include <linux/netdevice.h>
24 #include <linux/in6.h>
25 #include <linux/init.h>
26 #include <linux/netlink.h>
27 #include <linux/if_arp.h>
28
29 #ifdef  CONFIG_PROC_FS
30 #include <linux/proc_fs.h>
31 #endif
32
33 #include <net/snmp.h>
34 #include <net/ipv6.h>
35 #include <net/ip6_fib.h>
36 #include <net/ip6_route.h>
37 #include <net/ndisc.h>
38 #include <net/addrconf.h>
39 #include <net/tcp.h>
40 #include <linux/rtnetlink.h>
41
42 #include <asm/uaccess.h>
43
44 #ifdef CONFIG_SYSCTL
45 #include <linux/sysctl.h>
46 #endif
47
/*
 * Policy routing is switched off for this file: with CONFIG_RT6_POLICY
 * undefined, every "#ifdef CONFIG_RT6_POLICY" section below is compiled
 * out and ip6_rt_policy becomes the constant 0.
 */
48 #undef CONFIG_RT6_POLICY
49
50 /* Set to 3 to get tracing. */
51 #define RT6_DEBUG 2
52
/*
 * RDBG() takes a parenthesised printk argument list, RT6_TRACE() takes a
 * format string plus arguments.  Both expand to nothing unless
 * RT6_DEBUG >= 3.
 */
53 #if RT6_DEBUG >= 3
54 #define RDBG(x) printk x
55 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
56 #else
57 #define RDBG(x)
58 #define RT6_TRACE(x...) do { ; } while (0)
59 #endif
60
61
/*
 * Routing tunables.  Values written as multiples of HZ are times in jiffies.
 * ip6_dst_gc() reads ip6_rt_max_size, ip6_rt_gc_min_interval,
 * ip6_rt_gc_timeout and ip6_rt_gc_elasticity; ip6_rt_min_advmss is the
 * lower bound applied to a route's advertised MSS.
 * NOTE(review): "- 20 - 40" is presumably the TCP and IPv6 header sizes --
 * confirm.
 */
62 int ip6_rt_max_size = 4096;
63 int ip6_rt_gc_min_interval = 5*HZ;
64 int ip6_rt_gc_timeout = 60*HZ;
65 int ip6_rt_gc_interval = 30*HZ;
66 int ip6_rt_gc_elasticity = 9;
67 int ip6_rt_mtu_expires = 10*60*HZ;
68 int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
69
/*
 * Forward declarations.  Apart from ip6_rt_copy(), these are the functions
 * referenced by the ip6_dst_ops and ip6_null_entry initializers below.
 */
70 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
71 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
72 static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst,
73                                          struct sk_buff *skb);
74 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
75 static int               ip6_dst_gc(void);
76
77 static int              ip6_pkt_discard(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79
/*
 * Destination-cache operations for IPv6 routes.
 *
 * The initializer is positional, so which member each value lands in is
 * decided by the layout of struct dst_ops (declared elsewhere).
 * NOTE(review): 1024 is presumably gc_thresh, since ip6_dst_gc() compares
 * the entry count with ip6_dst_ops.gc_thresh -- confirm against the struct.
 * NOTE(review): the NULL between ip6_dst_reroute and ip6_negative_advice
 * is an unused callback slot; check the struct for which one.
 */
80 struct dst_ops ip6_dst_ops = {
81         AF_INET6,
82         __constant_htons(ETH_P_IPV6),
83         1024,
84
85         ip6_dst_gc,
86         ip6_dst_check,
87         ip6_dst_reroute,
88         NULL,
89         ip6_negative_advice,
90         ip6_link_failure,
91         sizeof(struct rt6_info),
92 };
93
/*
 * Statically allocated "no route" entry.
 *
 * It is the leaf of the routing-table root, what rt6_device_match() returns
 * on a strict miss, and what rt6_cow() returns when the copy fails.  It is
 * initialized with -ENETUNREACH, ip6_pkt_discard for both packet handlers
 * and RTF_REJECT|RTF_NONEXTHOP.
 *
 * The initializer is positional; the exact member each value fills depends
 * on struct rt6_info / struct dst_entry, which are declared elsewhere.
 */
94 struct rt6_info ip6_null_entry = {
95         {{NULL, ATOMIC_INIT(1), 1, &loopback_dev,
96           -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97           -ENETUNREACH, NULL, NULL,
98           ip6_pkt_discard, ip6_pkt_discard,
99 #ifdef CONFIG_NET_CLS_ROUTE
100           0,
101 #endif
102           &ip6_dst_ops}},
103         NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U,
104         255, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
105 };
106
/*
 * Root node of the IPv6 routing tree.  Its leaf is ip6_null_entry and it
 * carries RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO; BACKTRACK() stops climbing when
 * it reaches a node flagged RTN_ROOT.
 * (Positional initializer: see struct fib6_node for the member order.)
 */
107 struct fib6_node ip6_routing_table = {
108         NULL, NULL, NULL, NULL,
109         &ip6_null_entry,
110         0, RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO, 0
111 };
112
/*
 * Policy-routing state and helpers.  CONFIG_RT6_POLICY is #undef'd near the
 * top of this file, so only the #else branch is compiled: ip6_rt_policy is
 * the constant 0 and every "ip6_rt_policy == 0" test below is always true.
 */
113 #ifdef CONFIG_RT6_POLICY
114 int     ip6_rt_policy = 0;
115
116 struct pol_chain *rt6_pol_list = NULL;
117
118
119 static int rt6_flow_match_in(struct rt6_info *rt, struct sk_buff *skb);
120 static int rt6_flow_match_out(struct rt6_info *rt, struct sock *sk);
121
122 static struct rt6_info  *rt6_flow_lookup(struct rt6_info *rt,
123                                          struct in6_addr *daddr,
124                                          struct in6_addr *saddr,
125                                          struct fl_acc_args *args);
126
127 #else
128 #define ip6_rt_policy (0)
129 #endif
130
131 /* Protects all the ip6 fib */
/*
 * Lookups take it with read_lock_bh(); rt6_ins() and ip6_del_rt() take it
 * with write_lock_bh() around fib6_add() / fib6_del().
 */
132
133 rwlock_t rt6_lock = RW_LOCK_UNLOCKED;
134
135
136 /*
137  *      Route lookup. The caller is expected to hold rt6_lock (in any mode).
138  */
139
140 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
141                                                     int oif,
142                                                     int strict)
143 {
144         struct rt6_info *local = NULL;
145         struct rt6_info *sprt;
146
147         if (oif) {
148                 for (sprt = rt; sprt; sprt = sprt->u.next) {
149                         struct net_device *dev = sprt->rt6i_dev;
150                         if (dev->ifindex == oif)
151                                 return sprt;
152                         if (dev->flags&IFF_LOOPBACK)
153                                 local = sprt;
154                 }
155
156                 if (local)
157                         return local;
158
159                 if (strict)
160                         return &ip6_null_entry;
161         }
162         return rt;
163 }
164
165 /*
166  *      pointer to the last default router chosen. BH is disabled locally.
167  */
/*
 * rt6_best_dflt() stores its choice here and ip6_del_rt() resets it to
 * NULL; both do so while holding rt6_dflt_lock.
 */
168 static struct rt6_info *rt6_dflt_pointer = NULL;
169 static spinlock_t rt6_dflt_lock = SPIN_LOCK_UNLOCKED;
170
171 static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif)
172 {
173         struct rt6_info *match = NULL;
174         struct rt6_info *sprt;
175         int mpri = 0;
176
177         for (sprt = rt; sprt; sprt = sprt->u.next) {
178                 struct neighbour *neigh;
179
180                 if ((neigh = sprt->rt6i_nexthop) != NULL) {
181                         int m = -1;
182
183                         switch (neigh->nud_state) {
184                         case NUD_REACHABLE:
185                                 if (sprt != rt6_dflt_pointer) {
186                                         rt = sprt;
187                                         goto out;
188                                 }
189                                 m = 2;
190                                 break;
191
192                         case NUD_DELAY:
193                                 m = 1;
194                                 break;
195
196                         case NUD_STALE:
197                                 m = 1;
198                                 break;
199                         };
200
201                         if (oif && sprt->rt6i_dev->ifindex == oif) {
202                                 m += 2;
203                         }
204
205                         if (m >= mpri) {
206                                 mpri = m;
207                                 match = sprt;
208                         }
209                 }
210         }
211
212         if (match) {
213                 rt = match;
214         } else {
215                 /*
216                  *      No default routers are known to be reachable.
217                  *      SHOULD round robin
218                  */
219                 spin_lock(&rt6_dflt_lock);
220                 if (rt6_dflt_pointer) {
221                         struct rt6_info *next;
222
223                         if ((next = rt6_dflt_pointer->u.next) != NULL &&
224                             next->u.dst.obsolete <= 0 &&
225                             next->u.dst.error == 0)
226                                 rt = next;
227                 }
228                 spin_unlock(&rt6_dflt_lock);
229         }
230
231 out:
232         spin_lock(&rt6_dflt_lock);
233         rt6_dflt_pointer = rt;
234         spin_unlock(&rt6_dflt_lock);
235         return rt;
236 }
237
238 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
239                             int oif, int strict)
240 {
241         struct fib6_node *fn;
242         struct rt6_info *rt;
243
244         read_lock_bh(&rt6_lock);
245         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
246         rt = rt6_device_match(fn->leaf, oif, strict);
247         dst_hold(&rt->u.dst);
248         rt->u.dst.__use++;
249         read_unlock_bh(&rt6_lock);
250
251         rt->u.dst.lastuse = jiffies;
252         if (rt->u.dst.error == 0)
253                 return rt;
254         dst_release(&rt->u.dst);
255         return NULL;
256 }
257
258 /* rt6_ins is called with FREE rt6_lock.
259    It takes new route entry, the addition fails by any reason the
260    route is freed. In any case, if caller does not hold it, it may
261    be destroyed.
262  */
263
264 static int rt6_ins(struct rt6_info *rt)
265 {
266         int err;
267
268         write_lock_bh(&rt6_lock);
269         err = fib6_add(&ip6_routing_table, rt);
270         write_unlock_bh(&rt6_lock);
271
272         return err;
273 }
274
275 /* No rt6_lock! If COW faild, the function returns dead route entry
276    with dst->error set to errno value.
277  */
278
279 static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
280                                 struct in6_addr *saddr)
281 {
282         int err;
283         struct rt6_info *rt;
284
285         /*
286          *      Clone the route.
287          */
288
289         rt = ip6_rt_copy(ort);
290
291         if (rt) {
292                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
293
294                 if (!(rt->rt6i_flags&RTF_GATEWAY))
295                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
296
297                 rt->rt6i_dst.plen = 128;
298                 rt->rt6i_flags |= RTF_CACHE;
299                 rt->u.dst.flags |= DST_HOST;
300
301 #ifdef CONFIG_IPV6_SUBTREES
302                 if (rt->rt6i_src.plen && saddr) {
303                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
304                         rt->rt6i_src.plen = 128;
305                 }
306 #endif
307
308                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
309
310                 dst_clone(&rt->u.dst);
311
312                 err = rt6_ins(rt);
313                 if (err == 0)
314                         return rt;
315
316                 rt->u.dst.error = err;
317
318                 return rt;
319         }
320         dst_clone(&ip6_null_entry.u.dst);
321         return &ip6_null_entry;
322 }
323
324 #ifdef CONFIG_RT6_POLICY
325 static __inline__ struct rt6_info *rt6_flow_lookup_in(struct rt6_info *rt,
326                                                       struct sk_buff *skb)
327 {
328         struct in6_addr *daddr, *saddr;
329         struct fl_acc_args arg;
330
331         arg.type = FL_ARG_FORWARD;
332         arg.fl_u.skb = skb;
333
334         saddr = &skb->nh.ipv6h->saddr;
335         daddr = &skb->nh.ipv6h->daddr;
336
337         return rt6_flow_lookup(rt, daddr, saddr, &arg);
338 }
339
340 static __inline__ struct rt6_info *rt6_flow_lookup_out(struct rt6_info *rt,
341                                                        struct sock *sk,
342                                                        struct flowi *fl)
343 {
344         struct fl_acc_args arg;
345
346         arg.type = FL_ARG_ORIGIN;
347         arg.fl_u.fl_o.sk = sk;
348         arg.fl_u.fl_o.flow = fl;
349
350         return rt6_flow_lookup(rt, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr,
351                                &arg);
352 }
353
354 #endif
355
/*
 * BACKTRACK() - retry a strict lookup higher up the tree.
 *
 * Expands in a scope that provides the variables `rt`, `fn` and `strict`
 * and the labels `out` and `restart`.  When the device match produced
 * ip6_null_entry for a strict lookup, climb fn->parent:
 *   - on reaching a node flagged RTN_ROOT, take a reference on rt and
 *     jump to `out`;
 *   - on reaching a node flagged RTN_RTINFO, jump to `restart` so that
 *     node's leaf chain is tried.
 *
 * Wrapped in do { } while (0) so "BACKTRACK();" is a single statement and
 * cannot capture a following `else`.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry && strict) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				dst_clone(&rt->u.dst); \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
367
368
369 void ip6_route_input(struct sk_buff *skb)
370 {
371         struct fib6_node *fn;
372         struct rt6_info *rt;
373         int strict;
374         int attempts = 3;
375
376         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
377
378 relookup:
379         read_lock_bh(&rt6_lock);
380
381         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
382                          &skb->nh.ipv6h->saddr);
383
384 restart:
385         rt = fn->leaf;
386
387         if ((rt->rt6i_flags & RTF_CACHE)) {
388                 if (ip6_rt_policy == 0) {
389                         rt = rt6_device_match(rt, skb->dev->ifindex, strict);
390                         BACKTRACK();
391                         dst_clone(&rt->u.dst);
392                         goto out;
393                 }
394
395 #ifdef CONFIG_RT6_POLICY
396                 if ((rt->rt6i_flags & RTF_FLOW)) {
397                         struct rt6_info *sprt;
398
399                         for (sprt = rt; sprt; sprt = sprt->u.next) {
400                                 if (rt6_flow_match_in(sprt, skb)) {
401                                         rt = sprt;
402                                         dst_clone(&rt->u.dst);
403                                         goto out;
404                                 }
405                         }
406                 }
407 #endif
408         }
409
410         rt = rt6_device_match(rt, skb->dev->ifindex, 0);
411         BACKTRACK();
412
413         if (ip6_rt_policy == 0) {
414                 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
415                         read_unlock_bh(&rt6_lock);
416
417                         rt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
418                                      &skb->nh.ipv6h->saddr);
419                         
420                         if (rt->u.dst.error != -EEXIST || --attempts <= 0)
421                                 goto out2;
422                         /* Race condition! In the gap, when rt6_lock was
423                            released someone could insert this route.  Relookup.
424                          */
425                         goto relookup;
426                 }
427                 dst_clone(&rt->u.dst);
428         } else {
429 #ifdef CONFIG_RT6_POLICY
430                 rt = rt6_flow_lookup_in(rt, skb);
431 #else
432                 /* NEVER REACHED */
433 #endif
434         }
435
436 out:
437         read_unlock_bh(&rt6_lock);
438 out2:
439         rt->u.dst.lastuse = jiffies;
440         rt->u.dst.__use++;
441         skb->dst = (struct dst_entry *) rt;
442 }
443
444 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
445 {
446         struct fib6_node *fn;
447         struct rt6_info *rt;
448         int strict;
449         int attempts = 3;
450
451         strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
452
453 relookup:
454         read_lock_bh(&rt6_lock);
455
456         fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr,
457                          fl->nl_u.ip6_u.saddr);
458
459 restart:
460         rt = fn->leaf;
461
462         if ((rt->rt6i_flags & RTF_CACHE)) {
463                 if (ip6_rt_policy == 0) {
464                         rt = rt6_device_match(rt, fl->oif, strict);
465                         BACKTRACK();
466                         dst_clone(&rt->u.dst);
467                         goto out;
468                 }
469
470 #ifdef CONFIG_RT6_POLICY
471                 if ((rt->rt6i_flags & RTF_FLOW)) {
472                         struct rt6_info *sprt;
473
474                         for (sprt = rt; sprt; sprt = sprt->u.next) {
475                                 if (rt6_flow_match_out(sprt, sk)) {
476                                         rt = sprt;
477                                         dst_clone(&rt->u.dst);
478                                         goto out;
479                                 }
480                         }
481                 }
482 #endif
483         }
484         if (rt->rt6i_flags & RTF_DEFAULT) {
485                 if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF)
486                         rt = rt6_best_dflt(rt, fl->oif);
487         } else {
488                 rt = rt6_device_match(rt, fl->oif, strict);
489                 BACKTRACK();
490         }
491
492         if (ip6_rt_policy == 0) {
493                 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
494                         read_unlock_bh(&rt6_lock);
495
496                         rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr,
497                                      fl->nl_u.ip6_u.saddr);
498                         
499                         if (rt->u.dst.error != -EEXIST || --attempts <= 0)
500                                 goto out2;
501
502                         /* Race condition! In the gap, when rt6_lock was
503                            released someone could insert this route.  Relookup.
504                          */
505                         goto relookup;
506                 }
507                 dst_clone(&rt->u.dst);
508         } else {
509 #ifdef CONFIG_RT6_POLICY
510                 rt = rt6_flow_lookup_out(rt, sk, fl);
511 #else
512                 /* NEVER REACHED */
513 #endif
514         }
515
516 out:
517         read_unlock_bh(&rt6_lock);
518 out2:
519         rt->u.dst.lastuse = jiffies;
520         rt->u.dst.__use++;
521         return &rt->u.dst;
522 }
523
524
525 /*
526  *      Destination cache support functions
527  */
528
529 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
530 {
531         struct rt6_info *rt;
532
533         rt = (struct rt6_info *) dst;
534
535         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
536                 return dst;
537
538         dst_release(dst);
539         return NULL;
540 }
541
542 static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb)
543 {
544         /*
545          *      FIXME
546          */
547         RDBG(("ip6_dst_reroute(%p,%p)[%p] (AIEEE)\n", dst, skb,
548               __builtin_return_address(0)));
549         return NULL;
550 }
551
552 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
553 {
554         struct rt6_info *rt = (struct rt6_info *) dst;
555
556         if (rt) {
557                 if (rt->rt6i_flags & RTF_CACHE)
558                         ip6_del_rt(rt);
559                 else
560                         dst_release(dst);
561         }
562         return NULL;
563 }
564
565 static void ip6_link_failure(struct sk_buff *skb)
566 {
567         struct rt6_info *rt;
568
569         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
570
571         rt = (struct rt6_info *) skb->dst;
572         if (rt) {
573                 if (rt->rt6i_flags&RTF_CACHE) {
574                         dst_set_expires(&rt->u.dst, 0);
575                         rt->rt6i_flags |= RTF_EXPIRES;
576                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
577                         rt->rt6i_node->fn_sernum = -1;
578         }
579 }
580
581 static int ip6_dst_gc()
582 {
583         static unsigned expire = 30*HZ;
584         static unsigned long last_gc;
585         unsigned long now = jiffies;
586
587         if ((long)(now - last_gc) < ip6_rt_gc_min_interval &&
588             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
589                 goto out;
590
591         expire++;
592         fib6_run_gc(expire);
593         last_gc = now;
594         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
595                 expire = ip6_rt_gc_timeout>>1;
596
597 out:
598         expire -= expire>>ip6_rt_gc_elasticity;
599         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
600 }
601
/* Clean the host part of a prefix: every bit of @pfx beyond the first
   @plen bits is set to zero.  Not necessary in the radix tree, but results
   in cleaner routing tables.

   Remove it only when all the things will work!

   @plen must be in 0..128.  Any other value is ignored and @pfx is left
   untouched, because the computed byte offsets would fall outside the
   16-byte address.
 */

static void ipv6_wash_prefix(struct in6_addr *pfx, int plen)
{
	int b;	/* number of prefix bits in the last, partial byte */
	int o;	/* index of the first byte lying wholly in the host part */

	if (plen < 0 || plen > 128)
		return;

	b = plen & 0x7;
	o = (plen + 7) >> 3;

	if (o < 16)
		memset(pfx->s6_addr + o, 0, 16 - o);
	/* Keep only the top b bits of the byte the prefix ends in. */
	if (b != 0)
		pfx->s6_addr[plen >> 3] &= (0xFF << (8 - b));
}
618
619 static int ipv6_get_mtu(struct net_device *dev)
620 {
621         int mtu = IPV6_MIN_MTU;
622         struct inet6_dev *idev;
623
624         idev = in6_dev_get(dev);
625         if (idev) {
626                 mtu = idev->cnf.mtu6;
627                 in6_dev_put(idev);
628         }
629         return mtu;
630 }
631
632 static int ipv6_get_hoplimit(struct net_device *dev)
633 {
634         int hoplimit = ipv6_devconf.hop_limit;
635         struct inet6_dev *idev;
636
637         idev = in6_dev_get(dev);
638         if (idev) {
639                 hoplimit = idev->cnf.hop_limit;
640                 in6_dev_put(idev);
641         }
642         return hoplimit;
643 }
644
645 /*
646  *
647  */
648
/*
 * ip6_route_add - build a route from a user-supplied in6_rtmsg and insert
 * it into the routing table.
 *
 * Returns -EINVAL for a prefix length above 128 (or for any source prefix
 * when CONFIG_IPV6_SUBTREES is off), -ENOMEM when dst_alloc() fails, the
 * error preloaded before whichever check below fails, or otherwise the
 * result of rt6_ins().
 *
 * A zero rtmsg_metric is replaced in place by IP6_RT_PRIO_USER, so the
 * caller's rtmsg is modified.
 *
 * Every failure after the allocation leaves through "out", which drops the
 * device reference (if one is held) and frees the half-built route.
 */
649 int ip6_route_add(struct in6_rtmsg *rtmsg)
650 {
651         int err;
652         struct rt6_info *rt;
653         struct net_device *dev = NULL;
654         int addr_type;
655
656         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
657                 return -EINVAL;
658 #ifndef CONFIG_IPV6_SUBTREES
659         if (rtmsg->rtmsg_src_len)
660                 return -EINVAL;
661 #endif
662         if (rtmsg->rtmsg_metric == 0)
663                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
664
665         rt = dst_alloc(&ip6_dst_ops);
666
667         if (rt == NULL)
668                 return -ENOMEM;
669
670         rt->u.dst.obsolete = -1;
671         rt->rt6i_expires = rtmsg->rtmsg_info;
672
673         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
674
            /* Input handler depends on whether the destination is multicast. */
675         if (addr_type & IPV6_ADDR_MULTICAST)
676                 rt->u.dst.input = ip6_mc_input;
677         else
678                 rt->u.dst.input = ip6_forward;
679
680         rt->u.dst.output = ip6_output;
681
            /* An explicit interface index must name an existing device. */
682         if (rtmsg->rtmsg_ifindex) {
683                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
684                 err = -ENODEV;
685                 if (dev == NULL)
686                         goto out;
687         }
688
689         ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst);
690         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
691         if (rt->rt6i_dst.plen == 128)
692                rt->u.dst.flags = DST_HOST;
693         ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen);
694
695 #ifdef CONFIG_IPV6_SUBTREES
696         ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src);
697         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
698         ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen);
699 #endif
700
701         rt->rt6i_metric = rtmsg->rtmsg_metric;
702
703         /* We cannot add true routes via loopback here,
704            they would result in kernel looping; promote them to reject routes
705          */
706         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
707             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
                    /* Swap whatever device was given for the loopback device. */
708                 if (dev)
709                         dev_put(dev);
710                 dev = &loopback_dev;
711                 dev_hold(dev);
712                 rt->u.dst.output = ip6_pkt_discard;
713                 rt->u.dst.input = ip6_pkt_discard;
714                 rt->u.dst.error = -ENETUNREACH;
715                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
716                 goto install_route;
717         }
718
719         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
720                 struct in6_addr *gw_addr;
721                 int gwa_type;
722
723                 gw_addr = &rtmsg->rtmsg_gateway;
724                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
725                 gwa_type = ipv6_addr_type(gw_addr);
726
727                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
728                         struct rt6_info *grt;
729
730                         /* IPv6 strictly inhibits using not link-local
731                            addresses as nexthop address.
732                            Otherwise, router will not able to send redirects.
733                            It is very good, but in some (rare!) circumstances
734                            (SIT, PtP, NBMA NOARP links) it is handy to allow
735                            some exceptions. --ANK
736                          */
737                         err = -EINVAL;
738                         if (!(gwa_type&IPV6_ADDR_UNICAST))
739                                 goto out;
740
                            /* The gateway itself must be reachable through an existing route. */
741                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
742
743                         err = -EHOSTUNREACH;
744                         if (grt == NULL)
745                                 goto out;
746                         if (dev) {
747                                 if (dev != grt->rt6i_dev) {
748                                         dst_release(&grt->u.dst);
749                                         goto out;
750                                 }
751                         } else {
                                    /* No device given: inherit the one of the gateway's route. */
752                                 dev = grt->rt6i_dev;
753                                 dev_hold(dev);
754                         }
                            /* Accept only a gateway that is not itself behind another
                               gateway; otherwise err stays -EHOSTUNREACH. */
755                         if (!(grt->rt6i_flags&RTF_GATEWAY))
756                                 err = 0;
757                         dst_release(&grt->u.dst);
758
759                         if (err)
760                                 goto out;
761                 }
762                 err = -EINVAL;
763                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
764                         goto out;
765         }
766
767         err = -ENODEV;
768         if (dev == NULL)
769                 goto out;
770
771         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
772                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
773                 if (IS_ERR(rt->rt6i_nexthop)) {
774                         err = PTR_ERR(rt->rt6i_nexthop);
                            /* Do not leave an error-encoded pointer in the route being freed. */
775                         rt->rt6i_nexthop = NULL;
776                         goto out;
777                 }
778         }
779
780         if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
781                 rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS;
782         else
783                 rt->rt6i_hoplimit = ipv6_get_hoplimit(dev);
784         rt->rt6i_flags = rtmsg->rtmsg_flags;
785
    /* Reject routes jump straight here, skipping the gateway, nexthop and
       hop-limit setup above. */
786 install_route:
787         rt->u.dst.pmtu = ipv6_get_mtu(dev);
788         rt->u.dst.advmss = max(rt->u.dst.pmtu - 60, ip6_rt_min_advmss);
789         /* Maximal non-jumbo IPv6 payload is 65535 and corresponding
790            MSS is 65535 - tcp_header_size. 65535 is also valid and
791            means: "any MSS, rely only on pmtu discovery"
792          */
793         if (rt->u.dst.advmss > 65535-20)
794                 rt->u.dst.advmss = 65535;
            /* The device reference held in `dev` is handed over to the route. */
795         rt->u.dst.dev = dev;
796         return rt6_ins(rt);
797
798 out:
799         if (dev)
800                 dev_put(dev);
801         dst_free((struct dst_entry *) rt);
802         return err;
803 }
804
805 int ip6_del_rt(struct rt6_info *rt)
806 {
807         int err;
808
809         write_lock_bh(&rt6_lock);
810
811         spin_lock_bh(&rt6_dflt_lock);
812         rt6_dflt_pointer = NULL;
813         spin_unlock_bh(&rt6_dflt_lock);
814
815         dst_release(&rt->u.dst);
816
817         err = fib6_del(rt);
818         write_unlock_bh(&rt6_lock);
819
820         return err;
821 }
822
823 int ip6_route_del(struct in6_rtmsg *rtmsg)
824 {
825         struct fib6_node *fn;
826         struct rt6_info *rt;
827         int err = -ESRCH;
828
829         read_lock_bh(&rt6_lock);
830
831         fn = fib6_locate(&ip6_routing_table,
832                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
833                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
834         
835         if (fn) {
836                 for (rt = fn->leaf; rt; rt = rt->u.next) {
837                         if (rtmsg->rtmsg_ifindex &&
838                             (rt->rt6i_dev == NULL ||
839                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
840                                 continue;
841                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
842                             ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
843                                 continue;
844                         if (rtmsg->rtmsg_metric &&
845                             rtmsg->rtmsg_metric != rt->rt6i_metric)
846                                 continue;
847                         dst_clone(&rt->u.dst);
848                         read_unlock_bh(&rt6_lock);
849
850                         return ip6_del_rt(rt);
851                 }
852         }
853         read_unlock_bh(&rt6_lock);
854
855         return err;
856 }
857
858 /*
859  *      Handle redirects
860  */
861 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
862                   struct neighbour *neigh, int on_link)
863 {
864         struct rt6_info *rt, *nrt;
865
866         /* Locate old route to this destination. */
867         rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
868
869         if (rt == NULL)
870                 return;
871
872         if (neigh->dev != rt->rt6i_dev)
873                 goto out;
874
875         /* Redirect received -> path was valid.
876            Look, redirects are sent only in response to data packets,
877            so that this nexthop apparently is reachable. --ANK
878          */
879         dst_confirm(&rt->u.dst);
880
881         /* Duplicate redirect: silently ignore. */
882         if (neigh == rt->u.dst.neighbour)
883                 goto out;
884
885         /* Current route is on-link; redirect is always invalid.
886            
887            Seems, previous statement is not true. It could
888            be node, which looks for us as on-link (f.e. proxy ndisc)
889            But then router serving it might decide, that we should
890            know truth 8)8) --ANK (980726).
891          */
892         if (!(rt->rt6i_flags&RTF_GATEWAY))
893                 goto out;
894
895 #if !defined(CONFIG_IPV6_EUI64) || defined(CONFIG_IPV6_NO_PB)
896         /*
897          *      During transition gateways have more than
898          *      one link local address. Certainly, it is violation
899          *      of basic principles, but it is temporary.
900          */
901         /*
902          *      RFC 1970 specifies that redirects should only be
903          *      accepted if they come from the nexthop to the target.
904          *      Due to the way default routers are chosen, this notion
905          *      is a bit fuzzy and one might need to check all default
906          *      routers.
907          */
908
909         if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) {
910                 if (rt->rt6i_flags & RTF_DEFAULT) {
911                         struct rt6_info *rt1;
912
913                         read_lock(&rt6_lock);
914                         for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
915                                 if (!ipv6_addr_cmp(saddr, &rt1->rt6i_gateway)) {
916                                         dst_clone(&rt1->u.dst);
917                                         dst_release(&rt->u.dst);
918                                         read_unlock(&rt6_lock);
919                                         rt = rt1;
920                                         goto source_ok;
921                                 }
922                         }
923                         read_unlock(&rt6_lock);
924                 }
925                 if (net_ratelimit())
926                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
927                                "for redirect target\n");
928                 goto out;
929         }
930
931 source_ok:
932 #endif
933
934         /*
935          *      We have finally decided to accept it.
936          */
937
938         nrt = ip6_rt_copy(rt);
939         if (nrt == NULL)
940                 goto out;
941
942         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
943         if (on_link)
944                 nrt->rt6i_flags &= ~RTF_GATEWAY;
945
946         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
947         nrt->rt6i_dst.plen = 128;
948         nrt->u.dst.flags |= DST_HOST;
949
950         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
951         nrt->rt6i_nexthop = neigh_clone(neigh);
952         /* Reset pmtu, it may be better */
953         nrt->u.dst.pmtu = ipv6_get_mtu(neigh->dev);
954         nrt->u.dst.advmss = max(nrt->u.dst.pmtu - 60, ip6_rt_min_advmss);
955         if (rt->u.dst.advmss > 65535-20)
956                 rt->u.dst.advmss = 65535;
957         nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev);
958
959         if (rt6_ins(nrt))
960                 goto out;
961
962         if (rt->rt6i_flags&RTF_CACHE) {
963                 ip6_del_rt(rt);
964                 return;
965         }
966
967 out:
968         dst_release(&rt->u.dst);
969         return;
970 }
971
972 /*
973  *      Handle ICMP "packet too big" messages
974  *      i.e. Path MTU discovery
975  */
976
977 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
978                         struct net_device *dev, u32 pmtu)
979 {
980         struct rt6_info *rt, *nrt;
981
982         if (pmtu < IPV6_MIN_MTU) {
983                 if (net_ratelimit())
984                         printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n",
985                                pmtu);
986                 return;
987         }
988
989         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
990
991         if (rt == NULL)
992                 return;
993
994         if (pmtu >= rt->u.dst.pmtu)
995                 goto out;
996
997         /* New mtu received -> path was valid.
998            They are sent only in response to data packets,
999            so that this nexthop apparently is reachable. --ANK
1000          */
1001         dst_confirm(&rt->u.dst);
1002
1003         /* Host route. If it is static, it would be better
1004            not to override it, but add new one, so that
1005            when cache entry will expire old pmtu
1006            would return automatically.
1007          */
1008         if (rt->rt6i_flags & RTF_CACHE) {
1009                 rt->u.dst.pmtu = pmtu;
1010                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1011                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1012                 goto out;
1013         }
1014
1015         /* Network route.
1016            Two cases are possible:
1017            1. It is connected route. Action: COW
1018            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1019          */
1020         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
1021                 nrt = rt6_cow(rt, daddr, saddr);
1022                 if (!nrt->u.dst.error) {
1023                         nrt->u.dst.pmtu = pmtu;
1024                         dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1025                         nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1026                         dst_release(&nrt->u.dst);
1027                 }
1028         } else {
1029                 nrt = ip6_rt_copy(rt);
1030                 if (nrt == NULL)
1031                         goto out;
1032                 ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr);
1033                 nrt->rt6i_dst.plen = 128;
1034                 nrt->u.dst.flags |= DST_HOST;
1035                 nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
1036                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1037                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES;
1038                 nrt->u.dst.pmtu = pmtu;
1039                 rt6_ins(nrt);
1040         }
1041
1042 out:
1043         dst_release(&rt->u.dst);
1044 }
1045
1046 /*
1047  *      Misc support functions
1048  */
1049
1050 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1051 {
1052         struct rt6_info *rt;
1053
1054         rt = dst_alloc(&ip6_dst_ops);
1055
1056         if (rt) {
1057                 rt->u.dst.input = ort->u.dst.input;
1058                 rt->u.dst.output = ort->u.dst.output;
1059
1060                 memcpy(&rt->u.dst.mxlock, &ort->u.dst.mxlock, RTAX_MAX*sizeof(unsigned));
1061                 rt->u.dst.dev = ort->u.dst.dev;
1062                 if (rt->u.dst.dev)
1063                         dev_hold(rt->u.dst.dev);
1064                 rt->u.dst.lastuse = jiffies;
1065                 rt->rt6i_hoplimit = ort->rt6i_hoplimit;
1066                 rt->rt6i_expires = 0;
1067
1068                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1069                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1070                 rt->rt6i_metric = 0;
1071
1072                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1073 #ifdef CONFIG_IPV6_SUBTREES
1074                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1075 #endif
1076         }
1077         return rt;
1078 }
1079
1080 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1081 {       
1082         struct rt6_info *rt;
1083         struct fib6_node *fn;
1084
1085         fn = &ip6_routing_table;
1086
1087         write_lock_bh(&rt6_lock);
1088         for (rt = fn->leaf; rt; rt=rt->u.next) {
1089                 if (dev == rt->rt6i_dev &&
1090                     ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0)
1091                         break;
1092         }
1093         if (rt)
1094                 dst_clone(&rt->u.dst);
1095         write_unlock_bh(&rt6_lock);
1096         return rt;
1097 }
1098
1099 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1100                                      struct net_device *dev)
1101 {
1102         struct in6_rtmsg rtmsg;
1103
1104         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1105         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1106         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1107         rtmsg.rtmsg_metric = 1024;
1108         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP;
1109
1110         rtmsg.rtmsg_ifindex = dev->ifindex;
1111
1112         ip6_route_add(&rtmsg);
1113         return rt6_get_dflt_router(gwaddr, dev);
1114 }
1115
1116 void rt6_purge_dflt_routers(int last_resort)
1117 {
1118         struct rt6_info *rt;
1119         u32 flags;
1120
1121         if (last_resort)
1122                 flags = RTF_ALLONLINK;
1123         else
1124                 flags = RTF_DEFAULT | RTF_ADDRCONF;     
1125
1126 restart:
1127         read_lock_bh(&rt6_lock);
1128         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1129                 if (rt->rt6i_flags & flags) {
1130                         dst_hold(&rt->u.dst);
1131
1132                         spin_lock_bh(&rt6_dflt_lock);
1133                         rt6_dflt_pointer = NULL;
1134                         spin_unlock_bh(&rt6_dflt_lock);
1135
1136                         read_unlock_bh(&rt6_lock);
1137
1138                         ip6_del_rt(rt);
1139
1140                         goto restart;
1141                 }
1142         }
1143         read_unlock_bh(&rt6_lock);
1144 }
1145
1146 int ipv6_route_ioctl(unsigned int cmd, void *arg)
1147 {
1148         struct in6_rtmsg rtmsg;
1149         int err;
1150
1151         switch(cmd) {
1152         case SIOCADDRT:         /* Add a route */
1153         case SIOCDELRT:         /* Delete a route */
1154                 if (!capable(CAP_NET_ADMIN))
1155                         return -EPERM;
1156                 err = copy_from_user(&rtmsg, arg,
1157                                      sizeof(struct in6_rtmsg));
1158                 if (err)
1159                         return -EFAULT;
1160                         
1161                 rtnl_lock();
1162                 switch (cmd) {
1163                 case SIOCADDRT:
1164                         err = ip6_route_add(&rtmsg);
1165                         break;
1166                 case SIOCDELRT:
1167                         err = ip6_route_del(&rtmsg);
1168                         break;
1169                 default:
1170                         err = -EINVAL;
1171                 }
1172                 rtnl_unlock();
1173
1174                 return err;
1175         };
1176
1177         return -EINVAL;
1178 }
1179
1180 /*
1181  *      Drop the packet on the floor
1182  */
1183
1184 int ip6_pkt_discard(struct sk_buff *skb)
1185 {
1186         IP6_INC_STATS(Ip6OutNoRoutes);
1187         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
1188         kfree_skb(skb);
1189         return 0;
1190 }
1191
1192 /*
1193  *      Add address
1194  */
1195
1196 int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev)
1197 {
1198         struct rt6_info *rt;
1199
1200         rt = dst_alloc(&ip6_dst_ops);
1201         if (rt == NULL)
1202                 return -ENOMEM;
1203
1204         rt->u.dst.flags = DST_HOST;
1205         rt->u.dst.input = ip6_input;
1206         rt->u.dst.output = ip6_output;
1207         rt->rt6i_dev = dev_get_by_name("lo");
1208         rt->u.dst.pmtu = ipv6_get_mtu(rt->rt6i_dev);
1209         rt->u.dst.advmss = max(rt->u.dst.pmtu - 60, ip6_rt_min_advmss);
1210         if (rt->u.dst.advmss > 65535-20)
1211                 rt->u.dst.advmss = 65535;
1212         rt->rt6i_hoplimit = ipv6_get_hoplimit(rt->rt6i_dev);
1213         rt->u.dst.obsolete = -1;
1214
1215         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1216         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1217         if (rt->rt6i_nexthop == NULL) {
1218                 dst_free((struct dst_entry *) rt);
1219                 return -ENOMEM;
1220         }
1221
1222         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1223         rt->rt6i_dst.plen = 128;
1224         rt6_ins(rt);
1225
1226         return 0;
1227 }
1228
1229 /* Delete address. Warning: you should check that this address
1230    disappeared before calling this function.
1231  */
1232
1233 int ip6_rt_addr_del(struct in6_addr *addr, struct net_device *dev)
1234 {
1235         struct rt6_info *rt;
1236         int err = -ENOENT;
1237
1238         rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1);
1239         if (rt) {
1240                 if (rt->rt6i_dst.plen == 128)
1241                         err = ip6_del_rt(rt);
1242                 else
1243                         dst_release(&rt->u.dst);
1244         }
1245
1246         return err;
1247 }
1248
1249 #ifdef CONFIG_RT6_POLICY
1250
1251 static int rt6_flow_match_in(struct rt6_info *rt, struct sk_buff *skb)
1252 {
1253         struct flow_filter *frule;
1254         struct pkt_filter *filter;
1255         int res = 1;
1256
1257         if ((frule = rt->rt6i_filter) == NULL)
1258                 goto out;
1259
1260         if (frule->type != FLR_INPUT) {
1261                 res = 0;
1262                 goto out;
1263         }
1264
1265         for (filter = frule->u.filter; filter; filter = filter->next) {
1266                 __u32 *word;
1267
1268                 word = (__u32 *) skb->h.raw;
1269                 word += filter->offset;
1270
1271                 if ((*word ^ filter->value) & filter->mask) {
1272                         res = 0;
1273                         break;
1274                 }
1275         }
1276
1277 out:
1278         return res;
1279 }
1280
1281 static int rt6_flow_match_out(struct rt6_info *rt, struct sock *sk)
1282 {
1283         struct flow_filter *frule;
1284         int res = 1;
1285
1286         if ((frule = rt->rt6i_filter) == NULL)
1287                 goto out;
1288
1289         if (frule->type != FLR_INPUT) {
1290                 res = 0;
1291                 goto out;
1292         }
1293
1294         if (frule->u.sk != sk)
1295                 res = 0;
1296 out:
1297         return res;
1298 }
1299
1300 static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt,
1301                                         struct in6_addr *daddr,
1302                                         struct in6_addr *saddr,
1303                                         struct fl_acc_args *args)
1304 {
1305         struct flow_rule *frule;
1306         struct rt6_info *nrt = NULL;
1307         struct pol_chain *pol;
1308
1309         for (pol = rt6_pol_list; pol; pol = pol->next) {
1310                 struct fib6_node *fn;
1311                 struct rt6_info *sprt;
1312
1313                 fn = fib6_lookup(pol->rules, daddr, saddr);
1314
1315                 do {
1316                         for (sprt = fn->leaf; sprt; sprt=sprt->u.next) {
1317                                 int res;
1318
1319                                 frule = sprt->rt6i_flowr;
1320 #if RT6_DEBUG >= 2
1321                                 if (frule == NULL) {
1322                                         printk(KERN_DEBUG "NULL flowr\n");
1323                                         goto error;
1324                                 }
1325 #endif
1326                                 res = frule->ops->accept(rt, sprt, args, &nrt);
1327
1328                                 switch (res) {
1329                                 case FLOWR_SELECT:
1330                                         goto found;
1331                                 case FLOWR_CLEAR:
1332                                         goto next_policy;
1333                                 case FLOWR_NODECISION:
1334                                         break;
1335                                 default:
1336                                         goto error;
1337                                 };
1338                         }
1339
1340                         fn = fn->parent;
1341
1342                 } while ((fn->fn_flags & RTN_TL_ROOT) == 0);
1343
1344         next_policy:
1345         }
1346
1347 error:
1348         dst_clone(&ip6_null_entry.u.dst);
1349         return &ip6_null_entry;
1350
1351 found:
1352         if (nrt == NULL)
1353                 goto error;
1354
1355         nrt->rt6i_flags |= RTF_CACHE;
1356         dst_clone(&nrt->u.dst);
1357         err = rt6_ins(nrt);
1358         if (err)
1359                 nrt->u.dst.error = err;
1360         return nrt;
1361 }
1362 #endif
1363
1364 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1365 {
1366         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1367             rt != &ip6_null_entry) {
1368                 RT6_TRACE("deleted by ifdown %p\n", rt);
1369                 return -1;
1370         }
1371         return 0;
1372 }
1373
1374 void rt6_ifdown(struct net_device *dev)
1375 {
1376         write_lock_bh(&rt6_lock);
1377         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1378         write_unlock_bh(&rt6_lock);
1379 }
1380
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU of that device */
};
1386
1387 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1388 {
1389         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1390
1391         /* In IPv6 pmtu discovery is not optional,
1392            so that RTAX_MTU lock cannot disable it.
1393            We still use this lock to block changes
1394            caused by addrconf/ndisc.
1395         */
1396         if (rt->rt6i_dev == arg->dev &&
1397             !(rt->u.dst.mxlock&(1<<RTAX_MTU)))
1398                 rt->u.dst.pmtu = arg->mtu;
1399         rt->u.dst.advmss = max(arg->mtu - 60, ip6_rt_min_advmss);
1400         if (rt->u.dst.advmss > 65535-20)
1401                 rt->u.dst.advmss = 65535;
1402         return 0;
1403 }
1404
1405 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1406 {
1407         struct rt6_mtu_change_arg arg;
1408
1409         arg.dev = dev;
1410         arg.mtu = mtu;
1411         read_lock_bh(&rt6_lock);
1412         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1413         read_unlock_bh(&rt6_lock);
1414 }
1415
1416 #ifdef CONFIG_RTNETLINK
1417
1418 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1419                               struct in6_rtmsg *rtmsg)
1420 {
1421         memset(rtmsg, 0, sizeof(*rtmsg));
1422
1423         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1424         rtmsg->rtmsg_src_len = r->rtm_src_len;
1425         rtmsg->rtmsg_flags = RTF_UP;
1426         if (r->rtm_type == RTN_UNREACHABLE)
1427                 rtmsg->rtmsg_flags |= RTF_REJECT;
1428
1429         if (rta[RTA_GATEWAY-1]) {
1430                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1431                         return -EINVAL;
1432                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1433                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1434         }
1435         if (rta[RTA_DST-1]) {
1436                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1437                         return -EINVAL;
1438                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1439         }
1440         if (rta[RTA_SRC-1]) {
1441                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1442                         return -EINVAL;
1443                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1444         }
1445         if (rta[RTA_OIF-1]) {
1446                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1447                         return -EINVAL;
1448                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1449         }
1450         if (rta[RTA_PRIORITY-1]) {
1451                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1452                         return -EINVAL;
1453                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1454         }
1455         return 0;
1456 }
1457
1458 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1459 {
1460         struct rtmsg *r = NLMSG_DATA(nlh);
1461         struct in6_rtmsg rtmsg;
1462
1463         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1464                 return -EINVAL;
1465         return ip6_route_del(&rtmsg);
1466 }
1467
1468 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1469 {
1470         struct rtmsg *r = NLMSG_DATA(nlh);
1471         struct in6_rtmsg rtmsg;
1472
1473         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1474                 return -EINVAL;
1475         return ip6_route_add(&rtmsg);
1476 }
1477
/* Per-call state handed to the route dump callbacks via the walker. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* reply buffer being filled */
	struct netlink_callback *cb;	/* dump callback supplying pid and sequence */
};
1483
1484 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1485                          struct in6_addr *dst,
1486                          struct in6_addr *src,
1487                          int iif,
1488                          int type, u32 pid, u32 seq)
1489 {
1490         struct rtmsg *rtm;
1491         struct nlmsghdr  *nlh;
1492         unsigned char    *b = skb->tail;
1493         struct rta_cacheinfo ci;
1494
1495         nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
1496         rtm = NLMSG_DATA(nlh);
1497         rtm->rtm_family = AF_INET6;
1498         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1499         rtm->rtm_src_len = rt->rt6i_src.plen;
1500         rtm->rtm_tos = 0;
1501         rtm->rtm_table = RT_TABLE_MAIN;
1502         if (rt->rt6i_flags&RTF_REJECT)
1503                 rtm->rtm_type = RTN_UNREACHABLE;
1504         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1505                 rtm->rtm_type = RTN_LOCAL;
1506         else
1507                 rtm->rtm_type = RTN_UNICAST;
1508         rtm->rtm_flags = 0;
1509         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1510         rtm->rtm_protocol = RTPROT_BOOT;
1511         if (rt->rt6i_flags&RTF_DYNAMIC)
1512                 rtm->rtm_protocol = RTPROT_REDIRECT;
1513         else if (rt->rt6i_flags&(RTF_ADDRCONF|RTF_ALLONLINK))
1514                 rtm->rtm_protocol = RTPROT_KERNEL;
1515         else if (rt->rt6i_flags&RTF_DEFAULT)
1516                 rtm->rtm_protocol = RTPROT_RA;
1517
1518         if (rt->rt6i_flags&RTF_CACHE)
1519                 rtm->rtm_flags |= RTM_F_CLONED;
1520
1521         if (dst) {
1522                 RTA_PUT(skb, RTA_DST, 16, dst);
1523                 rtm->rtm_dst_len = 128;
1524         } else if (rtm->rtm_dst_len)
1525                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1526 #ifdef CONFIG_IPV6_SUBTREES
1527         if (src) {
1528                 RTA_PUT(skb, RTA_SRC, 16, src);
1529                 rtm->rtm_src_len = 128;
1530         } else if (rtm->rtm_src_len)
1531                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1532 #endif
1533         if (iif)
1534                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1535         else if (dst) {
1536                 struct in6_addr saddr_buf;
1537                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1538                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1539         }
1540         if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
1541                 goto rtattr_failure;
1542         if (rt->u.dst.neighbour)
1543                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1544         if (rt->u.dst.dev)
1545                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1546         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1547         ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1548         if (rt->rt6i_expires)
1549                 ci.rta_expires = rt->rt6i_expires - jiffies;
1550         else
1551                 ci.rta_expires = 0;
1552         ci.rta_used = rt->u.dst.__use;
1553         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1554         ci.rta_error = rt->u.dst.error;
1555         ci.rta_id = 0;
1556         ci.rta_ts = 0;
1557         ci.rta_tsage = 0;
1558         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1559         nlh->nlmsg_len = skb->tail - b;
1560         return skb->len;
1561
1562 nlmsg_failure:
1563 rtattr_failure:
1564         skb_trim(skb, b - skb->data);
1565         return -1;
1566 }
1567
1568 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1569 {
1570         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1571
1572         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1573                              NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq);
1574 }
1575
1576 static int fib6_dump_node(struct fib6_walker_t *w)
1577 {
1578         int res;
1579         struct rt6_info *rt;
1580
1581         for (rt = w->leaf; rt; rt = rt->u.next) {
1582                 res = rt6_dump_route(rt, w->args);
1583                 if (res < 0) {
1584                         /* Frame is full, suspend walking */
1585                         w->leaf = rt;
1586                         return 1;
1587                 }
1588                 BUG_TRAP(res!=0);
1589         }
1590         w->leaf = NULL;
1591         return 0;
1592 }
1593
1594 static void fib6_dump_end(struct netlink_callback *cb)
1595 {
1596         struct fib6_walker_t *w = (void*)cb->args[0];
1597
1598         if (w) {
1599                 cb->args[0] = 0;
1600                 fib6_walker_unlink(w);
1601                 kfree(w);
1602         }
1603         if (cb->args[1]) {
1604                 cb->done = (void*)cb->args[1];
1605                 cb->args[1] = 0;
1606         }
1607 }
1608
1609 static int fib6_dump_done(struct netlink_callback *cb)
1610 {
1611         fib6_dump_end(cb);
1612         return cb->done(cb);
1613 }
1614
1615 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1616 {
1617         struct rt6_rtnl_dump_arg arg;
1618         struct fib6_walker_t *w;
1619         int res;
1620
1621         arg.skb = skb;
1622         arg.cb = cb;
1623
1624         w = (void*)cb->args[0];
1625         if (w == NULL) {
1626                 /* New dump:
1627                  * 
1628                  * 1. hook callback destructor.
1629                  */
1630                 cb->args[1] = (long)cb->done;
1631                 cb->done = fib6_dump_done;
1632
1633                 /*
1634                  * 2. allocate and initialize walker.
1635                  */
1636                 w = kmalloc(sizeof(*w), GFP_ATOMIC);
1637                 if (w == NULL)
1638                         return -ENOMEM;
1639                 RT6_TRACE("dump<%p", w);
1640                 memset(w, 0, sizeof(*w));
1641                 w->root = &ip6_routing_table;
1642                 w->func = fib6_dump_node;
1643                 w->args = &arg;
1644                 cb->args[0] = (long)w;
1645                 read_lock_bh(&rt6_lock);
1646                 res = fib6_walk(w);
1647                 read_unlock_bh(&rt6_lock);
1648         } else {
1649                 w->args = &arg;
1650                 read_lock_bh(&rt6_lock);
1651                 res = fib6_walk_continue(w);
1652                 read_unlock_bh(&rt6_lock);
1653         }
1654 #if RT6_DEBUG >= 3
1655         if (res <= 0 && skb->len == 0)
1656                 RT6_TRACE("%p>dump end\n", w);
1657 #endif
1658         res = res < 0 ? res : skb->len;
1659         /* res < 0 is an error. (really, impossible)
1660            res == 0 means that dump is complete, but skb still can contain data.
1661            res > 0 dump is not complete, but frame is full.
1662          */
1663         /* Destroy walker, if dump of this table is complete. */
1664         if (res <= 0)
1665                 fib6_dump_end(cb);
1666         return res;
1667 }
1668
1669 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1670 {
1671         struct rtattr **rta = arg;
1672         int iif = 0;
1673         int err;
1674         struct sk_buff *skb;
1675         struct flowi fl;
1676         struct rt6_info *rt;
1677
1678         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1679         if (skb == NULL)
1680                 return -ENOBUFS;
1681
1682         /* Reserve room for dummy headers, this skb can pass
1683            through good chunk of routing engine.
1684          */
1685         skb->mac.raw = skb->data;
1686         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1687
1688         fl.proto = 0;
1689         fl.nl_u.ip6_u.daddr = NULL;
1690         fl.nl_u.ip6_u.saddr = NULL;
1691         fl.uli_u.icmpt.type = 0;
1692         fl.uli_u.icmpt.code = 0;
1693         if (rta[RTA_SRC-1])
1694                 fl.nl_u.ip6_u.saddr = (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]);
1695         if (rta[RTA_DST-1])
1696                 fl.nl_u.ip6_u.daddr = (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]);
1697
1698         if (rta[RTA_IIF-1])
1699                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1700
1701         if (iif) {
1702                 struct net_device *dev;
1703                 dev = __dev_get_by_index(iif);
1704                 if (!dev)
1705                         return -ENODEV;
1706         }
1707
1708         fl.oif = 0;
1709         if (rta[RTA_OIF-1])
1710                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1711
1712         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1713
1714         skb->dst = &rt->u.dst;
1715
1716         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1717         err = rt6_fill_node(skb, rt, 
1718                             fl.nl_u.ip6_u.daddr,
1719                             fl.nl_u.ip6_u.saddr,
1720                             iif,
1721                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq);
1722         if (err < 0)
1723                 return -EMSGSIZE;
1724
1725         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1726         if (err < 0)
1727                 return err;
1728         return 0;
1729 }
1730
1731 void inet6_rt_notify(int event, struct rt6_info *rt)
1732 {
1733         struct sk_buff *skb;
1734         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1735
1736         skb = alloc_skb(size, gfp_any());
1737         if (!skb) {
1738                 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
1739                 return;
1740         }
1741         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0) < 0) {
1742                 kfree_skb(skb);
1743                 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
1744                 return;
1745         }
1746         NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
1747         netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
1748 }
1749
1750 #endif
1751
1752 /*
1753  *      /proc
1754  */
1755
1756 #ifdef CONFIG_PROC_FS
1757
/* Length used for one route line when converting offsets to entries. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested */
	int skip;	/* entries skipped so far */
	int len;	/* bytes written to buffer so far */
};
1768
1769 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
1770 {
1771         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
1772         int i;
1773
1774         if (arg->skip < arg->offset / RT6_INFO_LEN) {
1775                 arg->skip++;
1776                 return 0;
1777         }
1778
1779         if (arg->len >= arg->length)
1780                 return 0;
1781
1782         for (i=0; i<16; i++) {
1783                 sprintf(arg->buffer + arg->len, "%02x",
1784                         rt->rt6i_dst.addr.s6_addr[i]);
1785                 arg->len += 2;
1786         }
1787         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1788                             rt->rt6i_dst.plen);
1789
1790 #ifdef CONFIG_IPV6_SUBTREES
1791         for (i=0; i<16; i++) {
1792                 sprintf(arg->buffer + arg->len, "%02x",
1793                         rt->rt6i_src.addr.s6_addr[i]);
1794                 arg->len += 2;
1795         }
1796         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1797                             rt->rt6i_src.plen);
1798 #else
1799         sprintf(arg->buffer + arg->len,
1800                 "00000000000000000000000000000000 00 ");
1801         arg->len += 36;
1802 #endif
1803
1804         if (rt->rt6i_nexthop) {
1805                 for (i=0; i<16; i++) {
1806                         sprintf(arg->buffer + arg->len, "%02x",
1807                                 rt->rt6i_nexthop->primary_key[i]);
1808                         arg->len += 2;
1809                 }
1810         } else {
1811                 sprintf(arg->buffer + arg->len,
1812                         "00000000000000000000000000000000");
1813                 arg->len += 32;
1814         }
1815         arg->len += sprintf(arg->buffer + arg->len,
1816                             " %08x %08x %08x %08x %8s\n",
1817                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
1818                             rt->u.dst.__use, rt->rt6i_flags, 
1819                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
1820         return 0;
1821 }
1822
1823 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
1824 {
1825         struct rt6_proc_arg arg;
1826         arg.buffer = buffer;
1827         arg.offset = offset;
1828         arg.length = length;
1829         arg.skip = 0;
1830         arg.len = 0;
1831
1832         read_lock_bh(&rt6_lock);
1833         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
1834         read_unlock_bh(&rt6_lock);
1835
1836         *start = buffer;
1837         if (offset)
1838                 *start += offset % RT6_INFO_LEN;
1839
1840         arg.len -= offset % RT6_INFO_LEN;
1841
1842         if (arg.len > length)
1843                 arg.len = length;
1844         if (arg.len < 0)
1845                 arg.len = 0;
1846
1847         return arg.len;
1848 }
1849
1850 extern struct rt6_statistics rt6_stats;
1851
1852 static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length)
1853 {
1854         int len;
1855
1856         len = sprintf(buffer, "%04x %04x %04x %04x %04x %04x\n",
1857                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
1858                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
1859                       rt6_stats.fib_rt_cache,
1860                       atomic_read(&ip6_dst_ops.entries));
1861
1862         len -= offset;
1863
1864         if (len > length)
1865                 len = length;
1866         if(len < 0)
1867                 len = 0;
1868
1869         *start = buffer + offset;
1870
1871         return len;
1872 }
1873 #endif  /* CONFIG_PROC_FS */
1874
1875 #ifdef CONFIG_SYSCTL
1876
1877 static int flush_delay;
1878
1879 static
1880 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1881                               void *buffer, size_t *lenp)
1882 {
1883         if (write) {
1884                 proc_dointvec(ctl, write, filp, buffer, lenp);
1885                 if (flush_delay < 0)
1886                         flush_delay = 0;
1887                 fib6_run_gc((unsigned long)flush_delay);
1888                 return 0;
1889         } else
1890                 return -EINVAL;
1891 }
1892
1893 ctl_table ipv6_route_table[] = {
1894         {NET_IPV6_ROUTE_FLUSH, "flush",
1895          &flush_delay, sizeof(int), 0644, NULL,
1896          &ipv6_sysctl_rtcache_flush},
1897         {NET_IPV6_ROUTE_GC_THRESH, "gc_thresh",
1898          &ip6_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
1899          &proc_dointvec},
1900         {NET_IPV6_ROUTE_MAX_SIZE, "max_size",
1901          &ip6_rt_max_size, sizeof(int), 0644, NULL,
1902          &proc_dointvec},
1903         {NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
1904          &ip6_rt_gc_min_interval, sizeof(int), 0644, NULL,
1905          &proc_dointvec_jiffies, &sysctl_jiffies},
1906         {NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout",
1907          &ip6_rt_gc_timeout, sizeof(int), 0644, NULL,
1908          &proc_dointvec_jiffies, &sysctl_jiffies},
1909         {NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval",
1910          &ip6_rt_gc_interval, sizeof(int), 0644, NULL,
1911          &proc_dointvec_jiffies, &sysctl_jiffies},
1912         {NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity",
1913          &ip6_rt_gc_elasticity, sizeof(int), 0644, NULL,
1914          &proc_dointvec_jiffies, &sysctl_jiffies},
1915         {NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires",
1916          &ip6_rt_mtu_expires, sizeof(int), 0644, NULL,
1917          &proc_dointvec_jiffies, &sysctl_jiffies},
1918         {NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss",
1919          &ip6_rt_min_advmss, sizeof(int), 0644, NULL,
1920          &proc_dointvec_jiffies, &sysctl_jiffies},
1921          {0}
1922 };
1923
1924 #endif
1925
1926
1927 void __init ip6_route_init(void)
1928 {
1929         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
1930                                                      sizeof(struct rt6_info),
1931                                                      0, SLAB_HWCACHE_ALIGN,
1932                                                      NULL, NULL);
1933         fib6_init();
1934 #ifdef  CONFIG_PROC_FS
1935         proc_net_create("ipv6_route", 0, rt6_proc_info);
1936         proc_net_create("rt6_stats", 0, rt6_proc_stats);
1937 #endif
1938 }
1939
1940 #ifdef MODULE
1941 void ip6_route_cleanup(void)
1942 {
1943 #ifdef CONFIG_PROC_FS
1944         proc_net_remove("ipv6_route");
1945         proc_net_remove("rt6_stats");
1946 #endif
1947
1948         rt6_ifdown(NULL);
1949         fib6_gc_cleanup();
1950 }
1951 #endif  /* MODULE */