/*
 * xfrm_policy.c
 *
 * Changes:
 *      Mitsuru KANDA @USAGI
 *      Kazunori MIYAZAWA @USAGI
 *      Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *              IPv6 support
 *      Kazunori MIYAZAWA @USAGI
 *      YOSHIFUJI Hideaki
 *              Split up af-specific portion
 *      Derek Atkins <derek@ihtfp.com>          Add the post_input processor
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <net/xfrm.h>
#include <net/ip.h>

DECLARE_MUTEX(xfrm_cfg_sem);

static rwlock_t xfrm_policy_lock = RW_LOCK_UNLOCKED;

struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];

static rwlock_t xfrm_policy_afinfo_lock = RW_LOCK_UNLOCKED;
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];

kmem_cache_t *xfrm_dst_cache;

static struct work_struct xfrm_policy_gc_work;
static struct list_head xfrm_policy_gc_list =
        LIST_HEAD_INIT(xfrm_policy_gc_list);
static spinlock_t xfrm_policy_gc_lock = SPIN_LOCK_UNLOCKED;

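/* Each address family keeps a per-protocol map of xfrm_type handlers
 * (AH, ESP, IPcomp, ...) in afinfo->type_map. The helpers below
 * register, unregister and look up entries in that map, attempting a
 * module load ("xfrm-type-<family>-<proto>") on a failed lookup.
 */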
int xfrm_register_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (likely(typemap->map[type->proto] == NULL))
                typemap->map[type->proto] = type;
        else
                err = -EEXIST;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (unlikely(typemap->map[type->proto] != type))
                err = -ENOENT;
        else
                typemap->map[type->proto] = NULL;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        struct xfrm_type_map *typemap;
        struct xfrm_type *type;
        int modload_attempted = 0;

retry:
        afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return NULL;
        typemap = afinfo->type_map;

        read_lock(&typemap->lock);
        type = typemap->map[proto];
        if (unlikely(type && !try_module_get(type->owner)))
                type = NULL;
        read_unlock(&typemap->lock);
        if (!type && !modload_attempted) {
                xfrm_policy_put_afinfo(afinfo);
                request_module("xfrm-type-%d-%d",
                               (int) family, (int) proto);
                modload_attempted = 1;
                goto retry;
        }

        xfrm_policy_put_afinfo(afinfo);
        return type;
}

int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
                    unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        if (likely(afinfo->dst_lookup != NULL))
                err = afinfo->dst_lookup(dst, fl);
        else
                err = -EINVAL;
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

void xfrm_put_type(struct xfrm_type *type)
{
        module_put(type->owner);
}

static inline unsigned long make_jiffies(long secs)
{
        if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
                return MAX_SCHEDULE_TIMEOUT-1;
        else
                return secs*HZ;
}

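/* Per-policy timer: enforces soft and hard lifetimes. A soft expiry
 * only warns key managers via km_policy_expired(); a hard expiry
 * deletes the policy. Otherwise the timer re-arms itself for the
 * nearest remaining deadline.
 */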
static void xfrm_policy_timer(unsigned long data)
{
        struct xfrm_policy *xp = (struct xfrm_policy*)data;
        unsigned long now = (unsigned long)xtime.tv_sec;
        long next = LONG_MAX;
        int warn = 0;
        int dir;

        if (xp->dead)
                goto out;

        dir = xp->index & 7;

        if (xp->lft.hard_add_expires_seconds) {
                long tmo = xp->lft.hard_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.hard_use_expires_seconds) {
                long tmo = xp->lft.hard_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_add_expires_seconds) {
                long tmo = xp->lft.soft_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_use_expires_seconds) {
                long tmo = xp->lft.soft_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }

        if (warn)
                km_policy_expired(xp, dir, 0);
        if (next != LONG_MAX &&
            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
                xfrm_pol_hold(xp);

out:
        xfrm_pol_put(xp);
        return;

expired:
        km_policy_expired(xp, dir, 1);
        xfrm_policy_delete(xp, dir);
        xfrm_pol_put(xp);
}


/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
 * SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(int gfp)
{
        struct xfrm_policy *policy;

        policy = kmalloc(sizeof(struct xfrm_policy), gfp);

        if (policy) {
                memset(policy, 0, sizeof(struct xfrm_policy));
                atomic_set(&policy->refcnt, 1);
                policy->lock = RW_LOCK_UNLOCKED;
                init_timer(&policy->timer);
                policy->timer.data = (unsigned long)policy;
                policy->timer.function = xfrm_policy_timer;
        }
        return policy;
}

/* Destroy xfrm_policy: descendant resources must be released to this moment. */

void __xfrm_policy_destroy(struct xfrm_policy *policy)
{
        if (!policy->dead)
                BUG();

        if (policy->bundles)
                BUG();

        if (del_timer(&policy->timer))
                BUG();

        kfree(policy);
}

static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
{
        struct dst_entry *dst;

        while ((dst = policy->bundles) != NULL) {
                policy->bundles = dst->next;
                dst_free(dst);
        }

        if (del_timer(&policy->timer))
                atomic_dec(&policy->refcnt);

        if (atomic_read(&policy->refcnt) > 1)
                flow_cache_flush();

        xfrm_pol_put(policy);
}

static void xfrm_policy_gc_task(void *data)
{
        struct xfrm_policy *policy;
        struct list_head *entry, *tmp;
        struct list_head gc_list = LIST_HEAD_INIT(gc_list);

        spin_lock_bh(&xfrm_policy_gc_lock);
        list_splice_init(&xfrm_policy_gc_list, &gc_list);
        spin_unlock_bh(&xfrm_policy_gc_lock);

        list_for_each_safe(entry, tmp, &gc_list) {
                policy = list_entry(entry, struct xfrm_policy, list);
                xfrm_policy_gc_kill(policy);
        }
}

/* Rule must be locked. Release descendant resources, announce
 * the entry dead. The rule must already be unlinked from the lists.
 */

void xfrm_policy_kill(struct xfrm_policy *policy)
{
        write_lock_bh(&policy->lock);
        if (policy->dead)
                goto out;

        policy->dead = 1;

        spin_lock(&xfrm_policy_gc_lock);
        list_add(&policy->list, &xfrm_policy_gc_list);
        spin_unlock(&xfrm_policy_gc_lock);
        schedule_work(&xfrm_policy_gc_work);

out:
        write_unlock_bh(&policy->lock);
}

/* Generate a new index. KAME seems to generate indices ordered by cost
 * at the price of absolutely unpredictable ordering of rules. That will
 * not do here. */
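/* The low three bits of an index encode the list slot (direction), so
 * xfrm_policy_timer() and xfrm_policy_byid() can recover it as
 * "index & 7"; the generator therefore advances in steps of 8.
 */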
static u32 xfrm_gen_index(int dir)
{
        u32 idx;
        struct xfrm_policy *p;
        static u32 idx_generator;

        for (;;) {
                idx = (idx_generator | dir);
                idx_generator += 8;
                if (idx == 0)
                        idx = 8;
                for (p = xfrm_policy_list[dir]; p; p = p->next) {
                        if (p->index == idx)
                                break;
                }
                if (!p)
                        return idx;
        }
}

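/* Insert a policy into the list for "dir", keeping the list sorted by
 * priority. A policy with an identical selector replaces the old entry
 * (inheriting its index), unless "excl" demands exclusive insertion.
 */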
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
        struct xfrm_policy *pol, **p;
        struct xfrm_policy *delpol = NULL;
        struct xfrm_policy **newpos = NULL;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
                        if (excl) {
                                write_unlock_bh(&xfrm_policy_lock);
                                return -EEXIST;
                        }
                        *p = pol->next;
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority)
                        continue;
                if (!newpos)
                        newpos = p;
                if (delpol)
                        break;
        }
        if (newpos)
                p = newpos;
        xfrm_pol_hold(policy);
        policy->next = *p;
        *p = policy;
        atomic_inc(&flow_cache_genid);
        policy->index = delpol ? delpol->index : xfrm_gen_index(dir);
        policy->curlft.add_time = (unsigned long)xtime.tv_sec;
        policy->curlft.use_time = 0;
        if (!mod_timer(&policy->timer, jiffies + HZ))
                xfrm_pol_hold(policy);
        write_unlock_bh(&xfrm_policy_lock);

        if (delpol) {
                xfrm_policy_kill(delpol);
        }
        return 0;
}

struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
                                      int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) {
                if (pol->index == id) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

void xfrm_policy_flush(void)
{
        struct xfrm_policy *xp;
        int dir;

        write_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                while ((xp = xfrm_policy_list[dir]) != NULL) {
                        xfrm_policy_list[dir] = xp->next;
                        write_unlock_bh(&xfrm_policy_lock);

                        xfrm_policy_kill(xp);

                        write_lock_bh(&xfrm_policy_lock);
                }
        }
        atomic_inc(&flow_cache_genid);
        write_unlock_bh(&xfrm_policy_lock);
}

int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
{
        struct xfrm_policy *xp;
        int dir;
        int count = 0;
        int error = 0;

        read_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next)
                        count++;
        }

        if (count == 0) {
                error = -ENOENT;
                goto out;
        }

        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) {
                        error = func(xp, dir%XFRM_POLICY_MAX, --count, data);
                        if (error)
                                goto out;
                }
        }

out:
        read_unlock_bh(&xfrm_policy_lock);
        return error;
}


/* Find policy to apply to this flow. */

static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
                               void **objp, atomic_t **obj_refp)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) {
                struct xfrm_selector *sel = &pol->selector;
                int match;

                if (pol->family != family)
                        continue;

                match = xfrm_selector_match(sel, fl, family);
                if (match) {
                        xfrm_pol_hold(pol);
                        break;
                }
        }
        read_unlock_bh(&xfrm_policy_lock);
        if ((*objp = (void *) pol) != NULL)
                *obj_refp = &pol->refcnt;
}

struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        if ((pol = sk->sk_policy[dir]) != NULL) {
                int match = xfrm_selector_match(&pol->selector, fl,
                                                sk->sk_family);
                if (match)
                        xfrm_pol_hold(pol);
                else
                        pol = NULL;
        }
        read_unlock_bh(&xfrm_policy_lock);
        return pol;
}

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
        pol->next = xfrm_policy_list[dir];
        xfrm_policy_list[dir] = pol;
        xfrm_pol_hold(pol);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir)
{
        struct xfrm_policy **polp;

        for (polp = &xfrm_policy_list[dir];
             *polp != NULL; polp = &(*polp)->next) {
                if (*polp == pol) {
                        *polp = pol->next;
                        atomic_dec(&pol->refcnt);
                        return pol;
                }
        }
        return NULL;
}

void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
        write_lock_bh(&xfrm_policy_lock);
        pol = __xfrm_policy_unlink(pol, dir);
        write_unlock_bh(&xfrm_policy_lock);
        if (pol)
                xfrm_policy_kill(pol);
}

int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
        struct xfrm_policy *old_pol;

        write_lock_bh(&xfrm_policy_lock);
        old_pol = sk->sk_policy[dir];
        sk->sk_policy[dir] = pol;
        if (pol) {
                pol->curlft.add_time = (unsigned long)xtime.tv_sec;
                pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir);
                __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
        }
        if (old_pol)
                __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
        write_unlock_bh(&xfrm_policy_lock);

        if (old_pol) {
                xfrm_policy_kill(old_pol);
        }
        return 0;
}

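/* Duplicate a per-socket policy onto the per-socket lists; used by
 * __xfrm_sk_clone_policy() below when a socket carrying policies is
 * cloned, e.g. when a connection is accepted.
 */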
static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
{
        struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC);

        if (newp) {
                newp->selector = old->selector;
                newp->lft = old->lft;
                newp->curlft = old->curlft;
                newp->action = old->action;
                newp->flags = old->flags;
                newp->xfrm_nr = old->xfrm_nr;
                newp->index = old->index;
                memcpy(newp->xfrm_vec, old->xfrm_vec,
                       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
                write_lock_bh(&xfrm_policy_lock);
                __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
                write_unlock_bh(&xfrm_policy_lock);
        }
        return newp;
}

int __xfrm_sk_clone_policy(struct sock *sk)
{
        struct xfrm_policy *p0 = sk->sk_policy[0],
                           *p1 = sk->sk_policy[1];

        sk->sk_policy[0] = sk->sk_policy[1] = NULL;
        if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
                return -ENOMEM;
        if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
                return -ENOMEM;
        return 0;
}

/* Resolve list of templates for the flow, given policy. */

static int
xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl,
                  struct xfrm_state **xfrm,
                  unsigned short family)
{
        int nx;
        int i, error;
        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);

        for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
                struct xfrm_state *x;
                xfrm_address_t *remote = daddr;
                xfrm_address_t *local  = saddr;
                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

                if (tmpl->mode) {
                        remote = &tmpl->id.daddr;
                        local = &tmpl->saddr;
                }

                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);

                if (x && x->km.state == XFRM_STATE_VALID) {
                        xfrm[nx++] = x;
                        daddr = remote;
                        saddr = local;
                        continue;
                }
                if (x) {
                        error = (x->km.state == XFRM_STATE_ERROR ?
                                 -EINVAL : -EAGAIN);
                        xfrm_state_put(x);
                }

                if (!tmpl->optional)
                        goto fail;
        }
        return nx;

fail:
        for (nx--; nx>=0; nx--)
                xfrm_state_put(xfrm[nx]);
        return error;
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

static struct dst_entry *
xfrm_find_bundle(struct flowi *fl, struct rtable *rt, struct xfrm_policy *policy, unsigned short family)
{
        struct dst_entry *x;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return ERR_PTR(-EINVAL);
        x = afinfo->find_bundle(fl, rt, policy);
        xfrm_policy_put_afinfo(afinfo);
        return x;
}

/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... Shortly, bundle a bundle.
 */

static int
xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
                   struct flowi *fl, struct dst_entry **dst_p,
                   unsigned short family)
{
        int err;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

static inline int policy_to_flow_dir(int dir)
{
        if (XFRM_POLICY_IN == FLOW_DIR_IN &&
            XFRM_POLICY_OUT == FLOW_DIR_OUT &&
            XFRM_POLICY_FWD == FLOW_DIR_FWD)
                return dir;
        switch (dir) {
        default:
        case XFRM_POLICY_IN:
                return FLOW_DIR_IN;
        case XFRM_POLICY_OUT:
                return FLOW_DIR_OUT;
        case XFRM_POLICY_FWD:
                return FLOW_DIR_FWD;
        }
}

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 */
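/* The flow_cache_genid snapshot taken at "restart" lets us detect a
 * policy change that happened while we slept in xfrm_tmpl_resolve()
 * and redo the whole lookup in that case.
 */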
int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
                struct sock *sk, int flags)
{
        struct xfrm_policy *policy;
        struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
        struct rtable *rt = (struct rtable*)*dst_p;
        struct dst_entry *dst;
        int nx = 0;
        int err;
        u32 genid;
        u16 family = (*dst_p)->ops->family;

        switch (family) {
        case AF_INET:
                if (!fl->fl4_src)
                        fl->fl4_src = rt->rt_src;
                if (!fl->fl4_dst)
                        fl->fl4_dst = rt->rt_dst;
                /* fall through */
        case AF_INET6:
                /* Still not clear... */
        default:
                /* nothing */;
        }

restart:
        genid = atomic_read(&flow_cache_genid);
        policy = NULL;
        if (sk && sk->sk_policy[1])
                policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);

        if (!policy) {
                /* To accelerate a bit...  */
                if ((rt->u.dst.flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
                        return 0;

                policy = flow_cache_lookup(fl, family,
                                           policy_to_flow_dir(XFRM_POLICY_OUT),
                                           xfrm_policy_lookup);
        }

        if (!policy)
                return 0;

        policy->curlft.use_time = (unsigned long)xtime.tv_sec;

        switch (policy->action) {
        case XFRM_POLICY_BLOCK:
                /* Prohibit the flow */
                xfrm_pol_put(policy);
                return -EPERM;

        case XFRM_POLICY_ALLOW:
                if (policy->xfrm_nr == 0) {
                        /* Flow passes not transformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                /* Try to find matching bundle.
                 *
                 * LATER: help from flow cache. It is optional, this
                 * is required only for output policy.
                 */
                dst = xfrm_find_bundle(fl, rt, policy, family);
                if (IS_ERR(dst)) {
                        xfrm_pol_put(policy);
                        return PTR_ERR(dst);
                }

                if (dst)
                        break;

                nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);

                if (unlikely(nx<0)) {
                        err = nx;
                        if (err == -EAGAIN) {
                                struct task_struct *tsk = current;
                                DECLARE_WAITQUEUE(wait, tsk);
                                if (!flags)
                                        goto error;

                                __set_task_state(tsk, TASK_INTERRUPTIBLE);
                                add_wait_queue(&km_waitq, &wait);
                                err = xfrm_tmpl_resolve(policy, fl, xfrm, family);
                                if (err == -EAGAIN)
                                        schedule();
                                __set_task_state(tsk, TASK_RUNNING);
                                remove_wait_queue(&km_waitq, &wait);

                                if (err == -EAGAIN && signal_pending(current)) {
                                        err = -ERESTART;
                                        goto error;
                                }
                                if (err == -EAGAIN ||
                                    genid != atomic_read(&flow_cache_genid)) {
                                        xfrm_pol_put(policy);
                                        goto restart;
                                }
                        }
                        if (err)
                                goto error;
                } else if (nx == 0) {
                        /* Flow passes not transformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                dst = &rt->u.dst;
                err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family);

                if (unlikely(err)) {
                        int i;
                        for (i=0; i<nx; i++)
                                xfrm_state_put(xfrm[i]);
                        goto error;
                }

                write_lock_bh(&policy->lock);
                if (unlikely(policy->dead)) {
                        /* Wow! While we worked on resolving, this
                         * policy has gone. Retry. It is not paranoia,
                         * we just cannot enlist new bundle to dead object.
                         */
                        write_unlock_bh(&policy->lock);

                        xfrm_pol_put(policy);
                        if (dst)
                                dst_free(dst);
                        goto restart;
                }
                dst->next = policy->bundles;
                policy->bundles = dst;
                dst_hold(dst);
                write_unlock_bh(&policy->lock);
        }
        *dst_p = dst;
        ip_rt_put(rt);
        xfrm_pol_put(policy);
        return 0;

error:
        ip_rt_put(rt);
        xfrm_pol_put(policy);
        *dst_p = NULL;
        return err;
}

/* When skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we make this in maximally
 * stupid way. Shame on me. :-) Of course, connected sockets must
 * have policy cached at them.
 */

static inline int
xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
              unsigned short family)
{
        return  x->id.proto == tmpl->id.proto &&
                (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
                (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
                x->props.mode == tmpl->mode &&
                (tmpl->aalgos & (1<<x->props.aalgo)) &&
                !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family));
}

static inline int
xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int idx,
               unsigned short family)
{
        for (; idx < sp->len; idx++) {
                if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family))
                        return ++idx;
        }
        return -1;
}

static int
_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        afinfo->decode_session(skb, fl);
        xfrm_policy_put_afinfo(afinfo);
        return 0;
}

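/* Inbound policy check: first verify that the SAs recorded in skb->sp
 * match their own selectors, then require that every non-optional
 * template of the matching policy is satisfied, in order, by the
 * decapsulated states.
 */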
int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
                        unsigned short family)
{
        struct xfrm_policy *pol;
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        /* First, check used SA against their selectors. */
        if (skb->sp) {
                int i;

                for (i=skb->sp->len-1; i>=0; i--) {
                        struct sec_decap_state *xvec = &(skb->sp->x[i]);
                        if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
                                return 0;

                        /* If there is a post_input processor, try running it */
                        if (xvec->xvec->type->post_input &&
                            (xvec->xvec->type->post_input)(xvec->xvec,
                                                           &(xvec->decap),
                                                           skb) != 0)
                                return 0;
                }
        }

        pol = NULL;
        if (sk && sk->sk_policy[dir])
                pol = xfrm_sk_policy_lookup(sk, dir, &fl);

        if (!pol)
                pol = flow_cache_lookup(&fl, family,
                                        policy_to_flow_dir(dir),
                                        xfrm_policy_lookup);

        if (!pol)
                return 1;

        pol->curlft.use_time = (unsigned long)xtime.tv_sec;

        if (pol->action == XFRM_POLICY_ALLOW) {
                if (pol->xfrm_nr != 0) {
                        struct sec_path *sp;
                        static struct sec_path dummy;
                        int i, k;

                        if ((sp = skb->sp) == NULL)
                                sp = &dummy;

                        /* For each tmpl search corresponding xfrm.
                         * Order is _important_. Later we will implement
                         * some barriers, but at the moment barriers
                         * are implied between each two transformations.
                         */
                        for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) {
                                if (pol->xfrm_vec[i].optional)
                                        continue;
                                k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family);
                                if (k < 0)
                                        goto reject;
                        }
                }
                xfrm_pol_put(pol);
                return 1;
        }

reject:
        xfrm_pol_put(pol);
        return 0;
}

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
}

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct dst_entry *child = dst;

        while (child) {
                if (child->obsolete > 0 ||
                    (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) {
                        dst_release(dst);
                        return NULL;
                }
                child = child->child;
        }

        return dst;
}

static void xfrm_dst_destroy(struct dst_entry *dst)
{
        xfrm_state_put(dst->xfrm);
        dst->xfrm = NULL;
}

static void xfrm_link_failure(struct sk_buff *skb)
{
        /* Impossible. Such dst must be popped before reaches point of failure. */
        return;
}

static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
{
        if (dst) {
                if (dst->obsolete) {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}

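/* Reap cached bundles that no longer hold any external reference,
 * scanning every policy's bundle list. Unreferenced bundles are moved
 * to a private list under the locks and freed afterwards.
 */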
static void __xfrm_garbage_collect(void)
{
        int i;
        struct xfrm_policy *pol;
        struct dst_entry *dst, **dstp, *gc_list = NULL;

        read_lock_bh(&xfrm_policy_lock);
        for (i=0; i<2*XFRM_POLICY_MAX; i++) {
                for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
                        write_lock(&pol->lock);
                        dstp = &pol->bundles;
                        while ((dst=*dstp) != NULL) {
                                if (atomic_read(&dst->__refcnt) == 0) {
                                        *dstp = dst->next;
                                        dst->next = gc_list;
                                        gc_list = dst;
                                } else {
                                        dstp = &dst->next;
                                }
                        }
                        write_unlock(&pol->lock);
                }
        }
        read_unlock_bh(&xfrm_policy_lock);

        while (gc_list) {
                dst = gc_list;
                gc_list = dst->next;
                dst_free(dst);
        }
}

static int bundle_depends_on(struct dst_entry *dst, struct xfrm_state *x)
{
        do {
                if (dst->xfrm == x)
                        return 1;
        } while ((dst = dst->child) != NULL);
        return 0;
}

int xfrm_flush_bundles(struct xfrm_state *x)
{
        int i;
        struct xfrm_policy *pol;
        struct dst_entry *dst, **dstp, *gc_list = NULL;

        read_lock_bh(&xfrm_policy_lock);
        for (i=0; i<2*XFRM_POLICY_MAX; i++) {
                for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
                        write_lock(&pol->lock);
                        dstp = &pol->bundles;
                        while ((dst=*dstp) != NULL) {
                                if (bundle_depends_on(dst, x)) {
                                        *dstp = dst->next;
                                        dst->next = gc_list;
                                        gc_list = dst;
                                } else {
                                        dstp = &dst->next;
                                }
                        }
                        write_unlock(&pol->lock);
                }
        }
        read_unlock_bh(&xfrm_policy_lock);

        while (gc_list) {
                dst = gc_list;
                gc_list = dst->next;
                dst_free(dst);
        }

        return 0;
}

/* Well... that's _TASK_. We need to scan through the transformation
 * list and figure out what mss tcp should generate in order for the
 * final datagram to fit the mtu. Mama mia... :-)
 *
 * Apparently, some easy way exists, but we used to choose the most
 * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta.
 *
 * Consider this function as something like dark humour. :-)
 */
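/* The loop below searches for a payload size "res" whose transformed
 * size fits within "mtu": each pass grows "m" by the per-state
 * overhead, then shrinks "res" by the overshoot until the estimate
 * fits or "res" becomes implausibly small.
 */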
static int xfrm_get_mss(struct dst_entry *dst, u32 mtu)
{
        int res = mtu - dst->header_len;

        for (;;) {
                struct dst_entry *d = dst;
                int m = res;

                do {
                        struct xfrm_state *x = d->xfrm;
                        if (x) {
                                spin_lock_bh(&x->lock);
                                if (x->km.state == XFRM_STATE_VALID &&
                                    x->type && x->type->get_max_size)
                                        m = x->type->get_max_size(d->xfrm, m);
                                else
                                        m += x->props.header_len;
                                spin_unlock_bh(&x->lock);
                        }
                } while ((d = d->child) != NULL);

                if (m <= mtu)
                        break;
                res -= (m - mtu);
                if (res < 88)
                        return mtu;
        }

        return res + dst->header_len;
}

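/* Register per-family policy hooks, filling in defaults for any
 * dst_ops callbacks and the garbage collector the caller left NULL.
 */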
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
                err = -ENOBUFS;
        else {
                struct dst_ops *dst_ops = afinfo->dst_ops;
                if (likely(dst_ops->kmem_cachep == NULL))
                        dst_ops->kmem_cachep = xfrm_dst_cache;
                if (likely(dst_ops->check == NULL))
                        dst_ops->check = xfrm_dst_check;
                if (likely(dst_ops->destroy == NULL))
                        dst_ops->destroy = xfrm_dst_destroy;
                if (likely(dst_ops->negative_advice == NULL))
                        dst_ops->negative_advice = xfrm_negative_advice;
                if (likely(dst_ops->link_failure == NULL))
                        dst_ops->link_failure = xfrm_link_failure;
                if (likely(dst_ops->get_mss == NULL))
                        dst_ops->get_mss = xfrm_get_mss;
                if (likely(afinfo->garbage_collect == NULL))
                        afinfo->garbage_collect = __xfrm_garbage_collect;
                xfrm_policy_afinfo[afinfo->family] = afinfo;
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}

int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
                if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
                        err = -EINVAL;
                else {
                        struct dst_ops *dst_ops = afinfo->dst_ops;
                        xfrm_policy_afinfo[afinfo->family] = NULL;
                        dst_ops->kmem_cachep = NULL;
                        dst_ops->check = NULL;
                        dst_ops->destroy = NULL;
                        dst_ops->negative_advice = NULL;
                        dst_ops->link_failure = NULL;
                        dst_ops->get_mss = NULL;
                        afinfo->garbage_collect = NULL;
                }
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}

struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        if (unlikely(family >= NPROTO))
                return NULL;
        read_lock(&xfrm_policy_afinfo_lock);
        afinfo = xfrm_policy_afinfo[family];
        if (likely(afinfo != NULL))
                read_lock(&afinfo->lock);
        read_unlock(&xfrm_policy_afinfo_lock);
        return afinfo;
}

void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        if (unlikely(afinfo == NULL))
                return;
        read_unlock(&afinfo->lock);
}

void __init xfrm_policy_init(void)
{
        xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
                                           sizeof(struct xfrm_dst),
                                           0, SLAB_HWCACHE_ALIGN,
                                           NULL, NULL);
        if (!xfrm_dst_cache)
                panic("XFRM: failed to allocate xfrm_dst_cache\n");

        INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL);
}

void __init xfrm_init(void)
{
        xfrm_state_init();
        xfrm_policy_init();
        xfrm_input_init();
}