commented early_printk patch because of rejects.
[linux-flexiantxendom0-3.2.10.git] / net / ipv4 / netfilter / ip_nat_core.c
1 /* NAT for netfilter; shared with compatibility layer. */
2
3 /* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
4    Public Licence. */
5 #include <linux/version.h>
6 #include <linux/module.h>
7 #include <linux/types.h>
8 #include <linux/timer.h>
9 #include <linux/skbuff.h>
10 #include <linux/netfilter_ipv4.h>
11 #include <linux/vmalloc.h>
12 #include <net/checksum.h>
13 #include <net/icmp.h>
14 #include <net/ip.h>
15 #include <net/tcp.h>  /* For tcp_prot in getorigdst */
16 #include <linux/icmp.h>
17 #include <linux/udp.h>
18
19 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
20 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
21
22 #include <linux/netfilter_ipv4/ip_conntrack.h>
23 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
24 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
25 #include <linux/netfilter_ipv4/ip_nat.h>
26 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
27 #include <linux/netfilter_ipv4/ip_nat_core.h>
28 #include <linux/netfilter_ipv4/ip_nat_helper.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
30 #include <linux/netfilter_ipv4/listhelp.h>
31
32 #if 0
33 #define DEBUGP printk
34 #else
35 #define DEBUGP(format, args...)
36 #endif
37
/* Protects the NAT hash tables, protocol list and helper list below. */
DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

/* Hash tables: conntracks keyed by original source manip (bysource),
   and by the outgoing src-ip/dst-ip/proto triple (byipsproto, used to
   pick the least-loaded IP in find_best_ips_proto). */
static struct list_head *bysource;
static struct list_head *byipsproto;
/* Registered per-protocol NAT handlers and NAT helpers. */
LIST_HEAD(protos);
LIST_HEAD(helpers);

/* Fallback handler returned by find_nat_proto() when no protocol
   matches. */
extern struct ip_nat_protocol unknown_nat_protocol;
50
51 /* We keep extra hashes for each conntrack, for fast searching. */
52 static inline size_t
53 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
54 {
55         /* Modified src and dst, to ensure we don't create two
56            identical streams. */
57         return (src + dst + proto) % ip_nat_htable_size;
58 }
59
60 static inline size_t
61 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
62 {
63         /* Original src, to ensure we map it consistently if poss. */
64         return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
65 }
66
/* Noone using conntrack by the time this called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
	struct ip_nat_info *info = &conn->nat.info;

	/* NAT was never set up for this conntrack: nothing was hashed. */
	if (!info->initialized)
		return;

	IP_NF_ASSERT(info->bysource.conntrack);
	IP_NF_ASSERT(info->byipsproto.conntrack);

	/* Unhash from both tables under the write lock.  Slots are
	   recomputed the same way place_in_hashes() chose them:
	   bysource from the ORIGINAL-direction source manip,
	   byipsproto from the REPLY tuple's addresses (hash is a
	   symmetric sum, so src/dst argument order is irrelevant). */
	WRITE_LOCK(&ip_nat_lock);
	LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
					  .tuple.src,
					  conn->tuplehash[IP_CT_DIR_ORIGINAL]
					  .tuple.dst.protonum)],
		    &info->bysource);

	LIST_DELETE(&byipsproto
		    [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
				      .tuple.src.ip,
				      conn->tuplehash[IP_CT_DIR_REPLY]
				      .tuple.dst.ip,
				      conn->tuplehash[IP_CT_DIR_REPLY]
				      .tuple.dst.protonum)],
		    &info->byipsproto);
	WRITE_UNLOCK(&ip_nat_lock);
}
95
96 /* We do checksum mangling, so if they were wrong before they're still
97  * wrong.  Also works for incomplete packets (eg. ICMP dest
98  * unreachables.) */
99 u_int16_t
100 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
101 {
102         u_int32_t diffs[] = { oldvalinv, newval };
103         return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
104                                       oldcheck^0xFFFF));
105 }
106
107 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
108 {
109         return i->protonum == proto;
110 }
111
/* Look up the NAT protocol handler for a protocol number.  Never
   returns NULL: falls back to unknown_nat_protocol.  Caller must hold
   ip_nat_lock for reading. */
struct ip_nat_protocol *
find_nat_proto(u_int16_t protonum)
{
	struct ip_nat_protocol *i;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
	if (!i)
		i = &unknown_nat_protocol;
	return i;
}
123
/* Is this tuple already taken? (not by us)
   Returns non-zero if some other conntrack would answer to this
   (outgoing) tuple. */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
		  const struct ip_conntrack *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	   incoming ones.  NAT means they don't have a fixed mapping,
	   so we invert the tuple and look for the incoming reply.

	   We could keep a separate hash if this proves too slow. */
	struct ip_conntrack_tuple reply;

	invert_tuplepr(&reply, tuple);
	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}
139
140 /* Does tuple + the source manip come within the range mr */
141 static int
142 in_range(const struct ip_conntrack_tuple *tuple,
143          const struct ip_conntrack_manip *manip,
144          const struct ip_nat_multi_range *mr)
145 {
146         struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
147         unsigned int i;
148         struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
149
150         for (i = 0; i < mr->rangesize; i++) {
151                 /* If we are allowed to map IPs, then we must be in the
152                    range specified, otherwise we must be unchanged. */
153                 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
154                         if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
155                             || (ntohl(newtuple.src.ip)
156                                 > ntohl(mr->range[i].max_ip)))
157                                 continue;
158                 } else {
159                         if (newtuple.src.ip != tuple->src.ip)
160                                 continue;
161                 }
162
163                 if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
164                     && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
165                                        &mr->range[i].min, &mr->range[i].max))
166                         return 1;
167         }
168         return 0;
169 }
170
/* LIST_FIND callback for find_appropriate_src(): does this hash
   entry's conntrack share the same ORIGINAL source (proto, ip,
   per-proto part) as `tuple', and does its existing source manip fall
   within the allowed range `mr'? */
static inline int
src_cmp(const struct ip_nat_hash *i,
	const struct ip_conntrack_tuple *tuple,
	const struct ip_nat_multi_range *mr)
{
	return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
		== tuple->dst.protonum
		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
		== tuple->src.ip
		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
		== tuple->src.u.all
		&& in_range(tuple,
			    &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			    .tuple.src,
			    mr));
}
187
/* Only called for SRC manip */
/* Return the existing source manip if some conntrack already maps
   this source within range mr (so the same mapping can be reused);
   NULL otherwise.  Caller must hold ip_nat_lock for reading. */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
		     const struct ip_nat_multi_range *mr)
{
	unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
	struct ip_nat_hash *i;

	MUST_BE_READ_LOCKED(&ip_nat_lock);
	i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
	if (i)
		return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
	else
		return NULL;
}
203
204 #ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too.
   Routes to var_ip and stores the source address the stack would pick
   in *other_ipp.  Returns 1 on success, 0 if no route (caller then
   treats this range element as unusable). */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
	struct rtable *rt;

	/* FIXME: IPTOS_TOS(iph->tos) --RR */
	if (ip_route_output_key(&rt, &fl) != 0) {
		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
		       NIPQUAD(var_ip));
		return 0;
	}

	*other_ipp = rt->rt_src;
	ip_rt_put(rt);
	return 1;
}
224 #endif
225
/* Simple way to iterate through all. */
static inline int fake_cmp(const struct ip_nat_hash *i,
			   u_int32_t src, u_int32_t dst, u_int16_t protonum,
			   unsigned int *score,
			   const struct ip_conntrack *conntrack)
{
	/* Compare backwards: we're dealing with OUTGOING tuples, and
	   inside the conntrack is the REPLY tuple.  Don't count this
	   conntrack. */
	if (i->conntrack != conntrack
	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
	    && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
		== protonum))
		(*score)++;
	/* Always "no match" so LIST_FIND walks the entire chain; the
	   result accumulates in *score instead. */
	return 0;
}
243
/* Count how many conntracks currently map onto this outgoing
   src/dst/proto triple (load metric for find_best_ips_proto).
   Caller must hold ip_nat_lock for reading. */
static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
	   const struct ip_conntrack *conntrack)
{
	unsigned int score = 0;

	/* fake_cmp never matches; it just bumps score per entry. */
	MUST_BE_READ_LOCKED(&ip_nat_lock);
	LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
		  fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
		  conntrack);

	return score;
}
257
/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have.  */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
		    const struct ip_nat_multi_range *mr,
		    const struct ip_conntrack *conntrack,
		    unsigned int hooknum)
{
	unsigned int i;
	struct {
		const struct ip_nat_range *range;
		unsigned int score;
		struct ip_conntrack_tuple tuple;
	} best = { NULL,  0xFFFFFFFF };	/* range NULL until a candidate wins */
	u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
	static unsigned int randomness;	/* spreads the start point per call */

	/* Which address do we vary: src for SRC manips, dst otherwise.
	   The other one is saved so do_extra_mangle can be undone. */
	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
		var_ipp = &tuple->src.ip;
		saved_ip = tuple->dst.ip;
		other_ipp = &tuple->dst.ip;
	} else {
		var_ipp = &tuple->dst.ip;
		saved_ip = tuple->src.ip;
		other_ipp = &tuple->src.ip;
	}
	/* Don't do do_extra_mangle unless necessary (overrides
	   explicit socket bindings, for example) */
	orig_dstip = tuple->dst.ip;

	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++) {
		/* Host order */
		u_int32_t minip, maxip, j;

		/* Don't do ranges which are already eliminated. */
		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
			continue;
		}

		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
			minip = ntohl(mr->range[i].min_ip);
			maxip = ntohl(mr->range[i].max_ip);
		} else
			minip = maxip = ntohl(*var_ipp);

		randomness++;
		/* Walk every IP in the range, starting at a rotating
		   offset so successive calls try different IPs first. */
		for (j = 0; j < maxip - minip + 1; j++) {
			unsigned int score;

			*var_ipp = htonl(minip + (randomness + j) 
					 % (maxip - minip + 1));

			/* Reset the other ip in case it was mangled by
			 * do_extra_mangle last time. */
			*other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (hooknum == NF_IP_LOCAL_OUT
			    && *var_ipp != orig_dstip
			    && !do_extra_mangle(*var_ipp, other_ipp)) {
				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
				       i, NIPQUAD(*var_ipp));
				/* Can't route?  This whole range part is
				 * probably screwed, but keep trying
				 * anyway. */
				continue;
			}
#endif

			/* Count how many others map onto this. */
			score = count_maps(tuple->src.ip, tuple->dst.ip,
					   tuple->dst.protonum, conntrack);
			if (score < best.score) {
				/* Optimization: doesn't get any better than
				   this. */
				if (score == 0)
					return (struct ip_nat_range *)
						&mr->range[i];

				best.score = score;
				best.tuple = *tuple;
				best.range = &mr->range[i];
			}
		}
	}
	/* Commit the best candidate (if none, best.tuple is the
	   zero-initialized aggregate and best.range is NULL -- the
	   caller checks the NULL return and ignores *tuple). */
	*tuple = best.tuple;

	/* Discard const. */
	return (struct ip_nat_range *)best.range;
}
357
/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
			 const struct ip_nat_multi_range *mr,
			 const struct ip_conntrack *conntrack,
			 unsigned int hooknum)
{
	/* Anything other than "one range, one IP, not yet exhausted"
	   goes through the full least-used search. */
	if (mr->rangesize != 1
	    || (mr->range[0].flags & IP_NAT_RANGE_FULL)
	    || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
		&& mr->range[0].min_ip != mr->range[0].max_ip))
		return find_best_ips_proto(tuple, mr, conntrack, hooknum);

	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
		if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
			tuple->src.ip = mr->range[0].min_ip;
		else {
			/* Only do extra mangle when required (breaks
			   socket binding) */
#ifdef CONFIG_IP_NF_NAT_LOCAL
			if (tuple->dst.ip != mr->range[0].min_ip
			    && hooknum == NF_IP_LOCAL_OUT
			    && !do_extra_mangle(mr->range[0].min_ip,
						&tuple->src.ip))
				return NULL;
#endif
			tuple->dst.ip = mr->range[0].min_ip;
		}
	}

	/* Discard const. */
	return (struct ip_nat_range *)&mr->range[0];
}
392
/* Find a unique NATed tuple for `conntrack' inside range `mrr',
   starting from orig_tuple.  Returns 1 and fills *tuple on success,
   0 if the whole range is exhausted.  NOTE: mrr is de-consted and its
   IP_NAT_RANGE_FULL flags are used as scratch space, then cleared
   again before returning. */
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
		 const struct ip_conntrack_tuple *orig_tuple,
		 const struct ip_nat_multi_range *mrr,
		 struct ip_conntrack *conntrack,
		 unsigned int hooknum)
{
	struct ip_nat_protocol *proto
		= find_nat_proto(orig_tuple->dst.protonum);
	struct ip_nat_range *rptr;
	unsigned int i;
	int ret;

	/* We temporarily use flags for marking full parts, but we
	   always clean up afterwards */
	struct ip_nat_multi_range *mr = (void *)mrr;

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	   and that same mapping gives a unique tuple within the given
	   range, use that.

	   This is only required for source (ie. NAT/masq) mappings.
	   So far, we don't do local source mappings, so multiple
	   manips not an issue.  */
	if (hooknum == NF_IP_POST_ROUTING) {
		struct ip_conntrack_manip *manip;

		manip = find_appropriate_src(orig_tuple, mr);
		if (manip) {
			/* Apply same source manipulation. */
			*tuple = ((struct ip_conntrack_tuple)
				  { *manip, orig_tuple->dst });
			DEBUGP("get_unique_tuple: Found current src map\n");
			return 1;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given
	   range.
	*/
	*tuple = *orig_tuple;
	while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
	       != NULL) {
		DEBUGP("Found best for "); DUMP_TUPLE(tuple);
		/* 3) The per-protocol part of the manip is made to
		   map into the range to make a unique tuple. */

		/* Only bother mapping if it's not already in range
		   and unique */
		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
					&rptr->min, &rptr->max))
		    && !ip_nat_used_tuple(tuple, conntrack)) {
			ret = 1;
			goto clear_fulls;
		} else {
			if (proto->unique_tuple(tuple, rptr,
						HOOK2MANIP(hooknum),
						conntrack)) {
				/* Must be unique. */
				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
								conntrack));
				ret = 1;
				goto clear_fulls;
			} else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
				/* Try implicit source NAT; protocol
				   may be able to play with ports to
				   make it unique. */
				struct ip_nat_range r
					= { IP_NAT_RANGE_MAP_IPS, 
					    tuple->src.ip, tuple->src.ip,
					    { 0 }, { 0 } };
				DEBUGP("Trying implicit mapping\n");
				if (proto->unique_tuple(tuple, &r,
							IP_NAT_MANIP_SRC,
							conntrack)) {
					/* Must be unique. */
					IP_NF_ASSERT(!ip_nat_used_tuple
						     (tuple, conntrack));
					ret = 1;
					goto clear_fulls;
				}
			}
			DEBUGP("Protocol can't get unique tuple %u.\n",
			       hooknum);
		}

		/* Eliminate that from range, and try again. */
		rptr->flags |= IP_NAT_RANGE_FULL;
		*tuple = *orig_tuple;
	}

	ret = 0;

 clear_fulls:
	/* Clear full flags. */
	IP_NF_ASSERT(mr->rangesize >= 1);
	for (i = 0; i < mr->rangesize; i++)
		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

	return ret;
}
495
/* LIST_FIND callback: does this NAT helper's tuple/mask match the
   given (reply) tuple? */
static inline int
helper_cmp(const struct ip_nat_helper *helper,
	   const struct ip_conntrack_tuple *tuple)
{
	return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
}
502
/* Where to manip the reply packets (will be reverse manip).
   Maps each NF hook to the hook the reply direction traverses. */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};
512
/* Set up NAT bindings for `conntrack' within range `mr' at hook
   `hooknum': picks a unique mapped tuple, alters the conntrack's
   reply tuple, and records the per-direction manips to apply later in
   do_bindings().  Returns NF_ACCEPT, or NF_DROP if no unique tuple
   can be found.  Caller must hold ip_nat_lock for writing. */
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
		  const struct ip_nat_multi_range *mr,
		  unsigned int hooknum)
{
	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
	struct ip_conntrack_tuple orig_tp;
	struct ip_nat_info *info = &conntrack->nat.info;

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
		     || hooknum == NF_IP_POST_ROUTING
		     || hooknum == NF_IP_LOCAL_OUT);
	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

	/* What we've got will look like inverse of reply. Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp =
	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	invert_tuplepr(&orig_tp,
		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
	{
	unsigned int i;

	DEBUGP("Hook %u (%s), ", hooknum,
	       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
	DUMP_TUPLE(&orig_tp);
	DEBUGP("Range %p: ", mr);
	for (i = 0; i < mr->rangesize; i++) {
		DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
		       i,
		       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
		       ? " MAP_IPS" : "",
		       (mr->range[i].flags
			& IP_NAT_RANGE_PROTO_SPECIFIED)
		       ? " PROTO_SPECIFIED" : "",
		       (mr->range[i].flags & IP_NAT_RANGE_FULL)
		       ? " FULL" : "",
		       NIPQUAD(mr->range[i].min_ip),
		       NIPQUAD(mr->range[i].max_ip),
		       mr->range[i].min.all,
		       mr->range[i].max.all);
	}
	}
#endif

	/* Repeat until the altered reply tuple is actually claimed:
	   another CPU may race us to the same reply tuple, in which
	   case ip_conntrack_alter_reply fails and we pick again. */
	do {
		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
				      hooknum)) {
			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
			       conntrack);
			return NF_DROP;
		}

#if 0
		DEBUGP("Hook %u (%s) %p\n", hooknum,
		       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
		       conntrack);
		DEBUGP("Original: ");
		DUMP_TUPLE(&orig_tp);
		DEBUGP("New: ");
		DUMP_TUPLE(&new_tuple);
#endif

		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
		   the original (A/B/C/D') and the mangled one (E/F/G/H').

		   We're only allowed to work with the SRC per-proto
		   part, so we create inverses of both to start, then
		   derive the other fields we need.  */

		/* Reply connection: simply invert the new tuple
		   (G/H/E/F') */
		invert_tuplepr(&reply, &new_tuple);

		/* Alter conntrack table so it recognizes replies.
		   If fail this race (reply tuple now used), repeat. */
	} while (!ip_conntrack_alter_reply(conntrack, &reply));

	/* FIXME: We can simply used existing conntrack reply tuple
	   here --RR */
	/* Create inverse of original: C/D/A/B' */
	invert_tuplepr(&inv_tuple, &orig_tp);

	/* Has source changed?. */
	if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_SRC, new_tuple.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a destination manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_DST, orig_tp.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* Has destination changed? */
	if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
		/* In this direction, a destination manip */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_ORIGINAL, hooknum,
			   IP_NAT_MANIP_DST, reply.src });

		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

		/* In the reverse direction, a source manip. */
		info->manips[info->num_manips++] =
			((struct ip_nat_info_manip)
			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
			   IP_NAT_MANIP_SRC, inv_tuple.src });
		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
	}

	/* If there's a helper, assign it; based on new tuple. */
	if (!conntrack->master)
		info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
					 &reply);

	/* It's done. */
	info->initialized |= (1 << HOOK2MANIP(hooknum));
	return NF_ACCEPT;
}
645
/* Re-hash an already-placed conntrack after its tuples changed.
   Caller must hold ip_nat_lock for writing. */
void replace_in_hashes(struct ip_conntrack *conntrack,
		       struct ip_nat_info *info)
{
	/* Source has changed, so replace in hashes. */
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	IP_NF_ASSERT(info->bysource.conntrack == conntrack);
	MUST_BE_WRITE_LOCKED(&ip_nat_lock);

	list_del(&info->bysource.list);
	list_del(&info->byipsproto.list);

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
674
/* Insert a freshly NATed conntrack into both lookup hashes.
   Caller must hold ip_nat_lock for writing. */
void place_in_hashes(struct ip_conntrack *conntrack,
		     struct ip_nat_info *info)
{
	unsigned int srchash
		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.src,
			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
			      .tuple.dst.protonum);
	/* We place packet as seen OUTGOING in byips_proto hash
	   (ie. reverse dst and src of reply packet). */
	unsigned int ipsprotohash
		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.src.ip,
				   conntrack->tuplehash[IP_CT_DIR_REPLY]
				   .tuple.dst.protonum);

	/* Must not already be hashed. */
	IP_NF_ASSERT(!info->bysource.conntrack);

	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
	info->byipsproto.conntrack = conntrack;
	info->bysource.conntrack = conntrack;

	list_prepend(&bysource[srchash], &info->bysource);
	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
702
703 /* Returns true if succeeded. */
704 static int
705 manip_pkt(u_int16_t proto,
706           struct sk_buff **pskb,
707           unsigned int iphdroff,
708           const struct ip_conntrack_manip *manip,
709           enum ip_nat_manip_type maniptype)
710 {
711         struct iphdr *iph;
712
713         (*pskb)->nfcache |= NFC_ALTERED;
714         if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
715                 return 0;
716
717         iph = (void *)(*pskb)->data + iphdroff;
718
719         /* Manipulate protcol part. */
720         if (!find_nat_proto(proto)->manip_pkt(pskb,
721                                               iphdroff + iph->ihl*4,
722                                               manip, maniptype))
723                 return 0;
724
725         iph = (void *)(*pskb)->data + iphdroff;
726
727         if (maniptype == IP_NAT_MANIP_SRC) {
728                 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
729                                                 iph->check);
730                 iph->saddr = manip->ip;
731         } else {
732                 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
733                                                 iph->check);
734                 iph->daddr = manip->ip;
735         }
736         return 1;
737 }
738
/* Ask the conntrack protocol (if it implements exp_matches_pkt)
   whether expectation `exp' applies to this packet; defaults to yes.
   Caller must hold ip_conntrack_lock for reading. */
static inline int exp_for_packet(struct ip_conntrack_expect *exp,
				 struct sk_buff *skb)
{
	struct ip_conntrack_protocol *proto;
	int ret = 1;

	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
	proto = __ip_ct_find_proto(skb->nh.iph->protocol);
	if (proto->exp_matches_pkt)
		ret = proto->exp_matches_pkt(exp, skb);

	return ret;
}
752
/* Do packet manipulations according to binding. */
/* Apply every manip recorded on this conntrack that matches the
 * packet's direction and the current hook, then give the NAT helper
 * (if any) a chance to mangle the payload, and finally fix up TCP
 * sequence numbers once per packet.
 *
 * Returns an NF_* verdict: NF_ACCEPT normally, NF_DROP if a manip
 * fails, or whatever the helper returned if it rejected the packet. */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	int proto = (*pskb)->nh.iph->protocol;

	/* Skip everything and don't call helpers if there are no
	 * manips for this connection */
	if (info->num_manips == 0)
		return NF_ACCEPT;

	/* Need nat lock to protect against modification, but neither
	   conntrack (referenced) and helper (deleted with
	   synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		/* Only manips recorded for this direction AND this
		 * hook fire here; others apply at different hooks. */
		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			/* NOTE(review): htons() on u.all below looks
			 * like it was meant to be ntohs() for display;
			 * harmless since DEBUGP compiles to nothing. */
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			if (!manip_pkt(proto, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype)) {
				/* Manip failed: unlock before bailing. */
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
		}
	}
	/* Snapshot the helper pointer under the lock; the helper
	 * itself cannot vanish afterwards (see comment above). */
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);

	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;
		int helper_called = 0;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		list_for_each(cur_item, &ct->sibling_list) {
			exp = list_entry(cur_item, struct ip_conntrack_expect,
					 expected_list);

			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			if (exp_for_packet(exp, *pskb)) {
				/* FIXME: May be true multiple times in the
				 * case of UDP!! */
				DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
				ret = helper->help(ct, exp, info, ctinfo,
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					/* Helper rejected the packet:
					 * drop the lock and propagate. */
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
				helper_called = 1;
			}
		}
		/* Helper might want to manip the packet even when there is no
		 * matching expectation for this packet */
		if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo,
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);

		/* Adjust sequence number only once per packet
		 * (helper is called at all hooks) */
		if (proto == IPPROTO_TCP
		    && (hooknum == NF_IP_POST_ROUTING
			|| hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in a l4-proto specific function,
			 * and call this function here. */
			if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
				ret = NF_DROP;
		}

		return ret;

	} else
		return NF_ACCEPT;

	/* not reached */
}
862
/* NAT an ICMP error packet (e.g. "port unreachable"): the embedded
 * inner header must be translated with the manip *inverted* (it was
 * never src/dst reversed like a normal reply), and the outer IP
 * header is then NATed as a normal reply.  The inner ICMP checksum
 * is recomputed at the end.
 *
 * Returns 1 on success, 0 on failure (bad checksum, unwritable skb,
 * suspicious redirect, or a failed manip). */
int
icmp_reply_translation(struct sk_buff **pskb,
		       struct ip_conntrack *conntrack,
		       unsigned int hooknum,
		       int dir)
{
	/* View of the embedded (quoted) packet: ICMP header followed
	 * by the inner IP header of the original datagram. */
	struct {
		struct icmphdr icmp;
		struct iphdr ip;
	} *inside;
	unsigned int i;
	struct ip_nat_info *info = &conntrack->nat.info;
	int hdrlen;

	if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
		return 0;
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* We're actually going to mangle it beyond trivial checksum
	   adjustment, so make sure the current checksum is correct. */
	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
		hdrlen = (*pskb)->nh.iph->ihl * 4;
		/* A non-zero folded checksum over the ICMP payload
		 * means the packet is corrupt: refuse to touch it. */
		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
						(*pskb)->len - hdrlen, 0)))
			return 0;
	}

	/* Must be RELATED */
	/* nfct points into the conntrack's infos[] array, so the
	 * pointer difference recovers the ctinfo index. */
	IP_NF_ASSERT((*pskb)->nfct
		     - (struct ip_conntrack *)(*pskb)->nfct->master
		     == IP_CT_RELATED
		     || (*pskb)->nfct
		     - (struct ip_conntrack *)(*pskb)->nfct->master
		     == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (inside->icmp.type == ICMP_REDIRECT) {
		/* Don't care about races here. */
		if (info->initialized
		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
		    || info->num_manips != 0)
			return 0;
	}

	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
	       *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
	/* Note: May not be from a NAT'd host, but probably safest to
	   do translation always as if it came from the host itself
	   (even though a "host unreachable" coming from the host
	   itself is a bit weird).

	   More explanation: some people use NAT for anonymizing.
	   Also, CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */

	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
		       "ORIG" : "REPLY", info->manips[i].hooknum);

		if (info->manips[i].direction != dir)
			continue;

		/* Mapping the inner packet is just like a normal
		   packet, except it was never src/dst reversed, so
		   where we would normally apply a dst manip, we apply
		   a src, and vice versa. */
		if (info->manips[i].hooknum == hooknum) {
			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "DST" : "SRC",
			       NIPQUAD(info->manips[i].manip.ip),
			       ntohs(info->manips[i].manip.u.udp.port));
			/* Inner packet: note the inverted maniptype
			 * (!maniptype flips SRC <-> DST). */
			if (!manip_pkt(inside->ip.protocol, pskb,
				       (*pskb)->nh.iph->ihl*4
				       + sizeof(inside->icmp),
				       &info->manips[i].manip,
				       !info->manips[i].maniptype))
				goto unlock_fail;

			/* Outer packet needs to have IP header NATed like
			   it's a reply. */

			/* Use mapping to map outer packet: 0 give no
			   per-proto mapping */
			DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip));
			if (!manip_pkt(0, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype))
				goto unlock_fail;
		}
	}
	READ_UNLOCK(&ip_nat_lock);

	hdrlen = (*pskb)->nh.iph->ihl * 4;

	/* manip_pkt may have reallocated the skb: refetch 'inside'
	 * before recomputing the ICMP checksum over the new data. */
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	inside->icmp.checksum = 0;
	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
						       (*pskb)->len - hdrlen,
						       0));
	return 1;

 unlock_fail:
	READ_UNLOCK(&ip_nat_lock);
	return 0;
}
979
980 int __init ip_nat_init(void)
981 {
982         size_t i;
983
984         /* Leave them the same for the moment. */
985         ip_nat_htable_size = ip_conntrack_htable_size;
986
987         /* One vmalloc for both hash tables */
988         bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
989         if (!bysource) {
990                 return -ENOMEM;
991         }
992         byipsproto = bysource + ip_nat_htable_size;
993
994         /* Sew in builtin protocols. */
995         WRITE_LOCK(&ip_nat_lock);
996         list_append(&protos, &ip_nat_protocol_tcp);
997         list_append(&protos, &ip_nat_protocol_udp);
998         list_append(&protos, &ip_nat_protocol_icmp);
999         WRITE_UNLOCK(&ip_nat_lock);
1000
1001         for (i = 0; i < ip_nat_htable_size; i++) {
1002                 INIT_LIST_HEAD(&bysource[i]);
1003                 INIT_LIST_HEAD(&byipsproto[i]);
1004         }
1005
1006         /* FIXME: Man, this is a hack.  <SIGH> */
1007         IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
1008         ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
1009
1010         return 0;
1011 }
1012
1013 /* Clear NAT section of all conntracks, in case we're loaded again. */
1014 static int clean_nat(const struct ip_conntrack *i, void *data)
1015 {
1016         memset((void *)&i->nat, 0, sizeof(i->nat));
1017         return 0;
1018 }
1019
/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
/* Tear down the NAT subsystem.  Order matters: first wipe NAT state
 * from every live conntrack, then detach our destruction hook, and
 * only then free the hash tables (bysource; byipsproto shares the
 * same vmalloc'd region, so one vfree releases both). */
void ip_nat_cleanup(void)
{
	ip_ct_selective_cleanup(&clean_nat, NULL);
	ip_conntrack_destroyed = NULL;
	vfree(bysource);
}