1 /* NAT for netfilter; shared with compatibility layer. */
3 /* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General Public Licence. */
5 #include <linux/version.h>
6 #include <linux/module.h>
7 #include <linux/types.h>
8 #include <linux/timer.h>
9 #include <linux/skbuff.h>
10 #include <linux/netfilter_ipv4.h>
11 #include <linux/vmalloc.h>
12 #include <net/checksum.h>
15 #include <net/tcp.h> /* For tcp_prot in getorigdst */
16 #include <linux/icmp.h>
17 #include <linux/udp.h>
19 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
20 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
22 #include <linux/netfilter_ipv4/ip_conntrack.h>
23 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
24 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
25 #include <linux/netfilter_ipv4/ip_nat.h>
26 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
27 #include <linux/netfilter_ipv4/ip_nat_core.h>
28 #include <linux/netfilter_ipv4/ip_nat_helper.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
30 #include <linux/netfilter_ipv4/listhelp.h>
35 #define DEBUGP(format, args...)
38 DECLARE_RWLOCK(ip_nat_lock);
39 DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
41 /* Calculated at init based on memory size */
42 static unsigned int ip_nat_htable_size;
44 static struct list_head *bysource;
45 static struct list_head *byipsproto;
49 extern struct ip_nat_protocol unknown_nat_protocol;
51 /* We keep extra hashes for each conntrack, for fast searching. */
/* Hash a (src ip, dst ip, protocol) triple into the byipsproto table.
 * NOTE(review): this listing is fragmentary (original line numbers skip);
 * the return-type line and braces of this function are not visible here. */
53 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
55 /* Modified src and dst, to ensure we don't create two
57 return (src + dst + proto) % ip_nat_htable_size;
/* Hash the original-direction source manip (ip + proto-specific part +
 * protocol number) into the bysource table. */
61 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
63 /* Original src, to ensure we map it consistently if poss. */
64 return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
67 /* Noone using conntrack by the time this called. */
/* Conntrack destructor hook: unlink this connection's two NAT hash
 * entries (bysource and byipsproto) under the NAT write lock.
 * Skipped entirely when NAT was never initialized for the conntrack. */
68 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
70 struct ip_nat_info *info = &conn->nat.info;
/* Nothing to unlink if no manip was ever set up. */
72 if (!info->initialized)
75 IP_NF_ASSERT(info->bysource.conntrack);
76 IP_NF_ASSERT(info->byipsproto.conntrack);
78 WRITE_LOCK(&ip_nat_lock);
/* Remove from bysource: keyed on ORIGINAL-direction src manip + protonum.
 * NOTE(review): argument lines are missing from this listing. */
79 LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
81 conn->tuplehash[IP_CT_DIR_ORIGINAL]
82 .tuple.dst.protonum)],
/* Remove from byipsproto: keyed on the REPLY-direction tuple. */
85 LIST_DELETE(&byipsproto
86 [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
88 conn->tuplehash[IP_CT_DIR_REPLY]
90 conn->tuplehash[IP_CT_DIR_REPLY]
91 .tuple.dst.protonum)],
93 WRITE_UNLOCK(&ip_nat_lock);
96 /* We do checksum mangling, so if they were wrong before they're still
97 * wrong. Also works for incomplete packets (eg. ICMP dest
/* Incrementally update a 16-bit Internet checksum: fold the difference
 * between ~oldval and newval into oldcheck (RFC 1624 style).
 * NOTE(review): the tail of the csum_partial() call is missing here. */
100 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
102 u_int32_t diffs[] = { oldvalinv, newval };
103 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
/* LIST_FIND predicate: does this registered NAT protocol handle `proto`? */
107 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
109 return i->protonum == proto;
/* Look up the NAT protocol helper for a protocol number; falls back to
 * unknown_nat_protocol when none is registered. Caller must hold the
 * NAT lock for reading (asserted below). */
112 struct ip_nat_protocol *
113 find_nat_proto(u_int16_t protonum)
115 struct ip_nat_protocol *i;
117 MUST_BE_READ_LOCKED(&ip_nat_lock);
118 i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
/* Not found: use the catch-all protocol (line above the assignment is
 * missing from this listing — presumably `if (!i)`; verify). */
120 i = &unknown_nat_protocol;
124 /* Is this tuple already taken? (not by us) */
/* Returns nonzero if some other conntrack would answer to this outgoing
 * tuple. Works by inverting the tuple and probing the conntrack table
 * for the corresponding incoming reply. */
126 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
127 const struct ip_conntrack *ignored_conntrack)
129 /* Conntrack tracking doesn't keep track of outgoing tuples; only
130 incoming ones. NAT means they don't have a fixed mapping,
131 so we invert the tuple and look for the incoming reply.
133 We could keep a separate hash if this proves too slow. */
134 struct ip_conntrack_tuple reply;
136 invert_tuplepr(&reply, tuple);
137 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
140 /* Does tuple + the source manip come within the range mr */
/* Check whether applying `manip` as the source of `tuple` lands inside
 * one of the ranges in `mr` (both the IP interval and, when
 * PROTO_SPECIFIED, the per-protocol min/max via proto->in_range). */
142 in_range(const struct ip_conntrack_tuple *tuple,
143 const struct ip_conntrack_manip *manip,
144 const struct ip_nat_multi_range *mr)
146 struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
/* Candidate tuple: proposed source manip + original destination. */
148 struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
150 for (i = 0; i < mr->rangesize; i++) {
151 /* If we are allowed to map IPs, then we must be in the
152 range specified, otherwise we must be unchanged. */
153 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
/* Compare in host order so min/max interval checks are meaningful. */
154 if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
155 || (ntohl(newtuple.src.ip)
156 > ntohl(mr->range[i].max_ip)))
/* No MAP_IPS: source IP must be left untouched. */
159 if (newtuple.src.ip != tuple->src.ip)
/* Per-protocol part (e.g. port) must also be in range when specified.
 * NOTE(review): the branch bodies/continue lines are missing here. */
163 if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
164 && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
165 &mr->range[i].min, &mr->range[i].max))
/* LIST_FIND predicate for the bysource chain: true when hash entry `i`
 * belongs to a conntrack whose ORIGINAL-direction protocol, source IP
 * and source proto-part all match `tuple`, and (per the in_range call
 * below) whose existing source manip falls inside `mr`. */
172 src_cmp(const struct ip_nat_hash *i,
173 const struct ip_conntrack_tuple *tuple,
174 const struct ip_nat_multi_range *mr)
176 return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
177 == tuple->dst.protonum
178 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
180 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
/* Trailing in_range(...) arguments; the comparison RHS lines are
 * missing from this listing. */
183 &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
188 /* Only called for SRC manip */
/* Reuse an existing source mapping: find a conntrack with the same
 * original src/proto whose manip fits `mr`, and return a pointer to
 * that manip (NULL path not visible in this fragment). */
189 static struct ip_conntrack_manip *
190 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
191 const struct ip_nat_multi_range *mr)
193 unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
194 struct ip_nat_hash *i;
196 MUST_BE_READ_LOCKED(&ip_nat_lock);
197 i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
199 return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
204 #ifdef CONFIG_IP_NF_NAT_LOCAL
205 /* If it's really a local destination manip, it may need to do a
/* Re-route after a locally-generated packet's destination was changed:
 * look up the route to var_ip and rewrite *other_ipp to the source
 * address that route would use (rt->rt_src). Returns failure when no
 * route exists (return statements missing from this listing). */
208 do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
210 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
213 /* FIXME: IPTOS_TOS(iph->tos) --RR */
214 if (ip_route_output_key(&rt, &fl) != 0) {
215 DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
220 *other_ipp = rt->rt_src;
226 /* Simple way to iterate through all. */
/* "Comparison" callback abused as an iterator for count_maps(): for each
 * byipsproto entry matching (src,dst,proto) — compared against the REPLY
 * tuple, i.e. backwards — it bumps a score counter (the score parameter
 * and increment line are missing from this listing) and never matches. */
227 static inline int fake_cmp(const struct ip_nat_hash *i,
228 u_int32_t src, u_int32_t dst, u_int16_t protonum,
230 const struct ip_conntrack *conntrack)
232 /* Compare backwards: we're dealing with OUTGOING tuples, and
233 inside the conntrack is the REPLY tuple. Don't count this
235 if (i->conntrack != conntrack
236 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
237 && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
238 && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
/* Count how many existing connections (excluding `conntrack`) map onto
 * the same src/dst/proto triple, by walking one byipsproto chain with
 * fake_cmp accumulating into `score`. Read lock must be held. */
244 static inline unsigned int
245 count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
246 const struct ip_conntrack *conntrack)
248 unsigned int score = 0;
250 MUST_BE_READ_LOCKED(&ip_nat_lock);
251 LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
252 fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
258 /* For [FUTURE] fragmentation handling, we want the least-used
259 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
260 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
261 1-65535, we don't do pro-rata allocation based on ports; we choose
262 the ip with the lowest src-ip/dst-ip/proto usage.
264 If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
265 range), we eliminate that and try again. This is not the most
266 efficient approach, but if you're worried about that, don't hand us
267 ranges you don't really have. */
/* Pick the IP (and thus range) in `mr` with the lowest usage count for
 * this tuple's src/dst/proto triple; mutates tuple's variable IP side
 * depending on whether this hook does SRC or DST manip. Many loop/
 * brace lines are missing from this listing. */
268 static struct ip_nat_range *
269 find_best_ips_proto(struct ip_conntrack_tuple *tuple,
270 const struct ip_nat_multi_range *mr,
271 const struct ip_conntrack *conntrack,
272 unsigned int hooknum)
/* Best candidate so far: range pointer + lowest score (initialized to
 * "infinity"). The struct's opening lines are missing here. */
276 const struct ip_nat_range *range;
278 struct ip_conntrack_tuple tuple;
279 } best = { NULL, 0xFFFFFFFF };
280 u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
281 static unsigned int randomness;
/* SRC manip varies the source IP; DST manip varies the destination. */
283 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
284 var_ipp = &tuple->src.ip;
285 saved_ip = tuple->dst.ip;
286 other_ipp = &tuple->dst.ip;
288 var_ipp = &tuple->dst.ip;
289 saved_ip = tuple->src.ip;
290 other_ipp = &tuple->src.ip;
292 /* Don't do do_extra_mangle unless necessary (overrides
293 explicit socket bindings, for example) */
294 orig_dstip = tuple->dst.ip;
296 IP_NF_ASSERT(mr->rangesize >= 1);
297 for (i = 0; i < mr->rangesize; i++) {
299 u_int32_t minip, maxip, j;
301 /* Don't do ranges which are already eliminated. */
302 if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
306 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
307 minip = ntohl(mr->range[i].min_ip);
308 maxip = ntohl(mr->range[i].max_ip);
/* No MAP_IPS: the only candidate is the current IP. */
310 minip = maxip = ntohl(*var_ipp);
/* Walk every candidate IP in [minip, maxip], starting at a
 * per-boot-random offset so allocations spread out. */
313 for (j = 0; j < maxip - minip + 1; j++) {
316 *var_ipp = htonl(minip + (randomness + j)
317 % (maxip - minip + 1));
319 /* Reset the other ip in case it was mangled by
320 * do_extra_mangle last time. */
321 *other_ipp = saved_ip;
323 #ifdef CONFIG_IP_NF_NAT_LOCAL
/* Locally-generated packet whose dst changed: re-route to fix src. */
324 if (hooknum == NF_IP_LOCAL_OUT
325 && *var_ipp != orig_dstip
326 && !do_extra_mangle(*var_ipp, other_ipp)) {
327 DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
328 i, NIPQUAD(*var_ipp));
329 /* Can't route? This whole range part is
330 * probably screwed, but keep trying
336 /* Count how many others map onto this. */
337 score = count_maps(tuple->src.ip, tuple->dst.ip,
338 tuple->dst.protonum, conntrack);
339 if (score < best.score) {
340 /* Optimization: doesn't get any better than
/* Score 0 is unbeatable — return immediately (condition line missing). */
343 return (struct ip_nat_range *)
348 best.range = &mr->range[i];
355 return (struct ip_nat_range *)best.range;
358 /* Fast version doesn't iterate through hash chains, but only handles
359 common case of single IP address (null NAT, masquerade) */
360 static struct ip_nat_range *
361 find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
362 const struct ip_nat_multi_range *mr,
363 const struct ip_conntrack *conntrack,
364 unsigned int hooknum)
/* Fall back to the slow path unless this is a single range with (at
 * most) a single fixed IP. */
366 if (mr->rangesize != 1
367 || (mr->range[0].flags & IP_NAT_RANGE_FULL)
368 || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
369 && mr->range[0].min_ip != mr->range[0].max_ip))
370 return find_best_ips_proto(tuple, mr, conntrack, hooknum);
372 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
373 if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
374 tuple->src.ip = mr->range[0].min_ip;
/* DST branch: else-path lines are missing from this listing. */
376 /* Only do extra mangle when required (breaks
378 #ifdef CONFIG_IP_NF_NAT_LOCAL
379 if (tuple->dst.ip != mr->range[0].min_ip
380 && hooknum == NF_IP_LOCAL_OUT
381 && !do_extra_mangle(mr->range[0].min_ip,
385 tuple->dst.ip = mr->range[0].min_ip;
390 return (struct ip_nat_range *)&mr->range[0];
/* Derive a unique NAT'd tuple from orig_tuple within the ranges in mrr:
 * (1) reuse an existing source mapping when possible, (2) pick the
 * least-used IP/range, (3) let the per-protocol helper make the tuple
 * unique; ranges that fill up are flagged FULL and retried, flags are
 * cleared before returning. Return-type and several control lines are
 * missing from this listing. */
394 get_unique_tuple(struct ip_conntrack_tuple *tuple,
395 const struct ip_conntrack_tuple *orig_tuple,
396 const struct ip_nat_multi_range *mrr,
397 struct ip_conntrack *conntrack,
398 unsigned int hooknum)
400 struct ip_nat_protocol *proto
401 = find_nat_proto(orig_tuple->dst.protonum);
402 struct ip_nat_range *rptr;
406 /* We temporarily use flags for marking full parts, but we
407 always clean up afterwards */
/* Cast away const: IP_NAT_RANGE_FULL is set/cleared on mrr in place. */
408 struct ip_nat_multi_range *mr = (void *)mrr;
410 /* 1) If this srcip/proto/src-proto-part is currently mapped,
411 and that same mapping gives a unique tuple within the given
414 This is only required for source (ie. NAT/masq) mappings.
415 So far, we don't do local source mappings, so multiple
416 manips not an issue. */
417 if (hooknum == NF_IP_POST_ROUTING) {
418 struct ip_conntrack_manip *manip;
420 manip = find_appropriate_src(orig_tuple, mr);
/* NOTE(review): the `if (manip)` / uniqueness-check lines are missing. */
422 /* Apply same source manipulation. */
423 *tuple = ((struct ip_conntrack_tuple)
424 { *manip, orig_tuple->dst });
425 DEBUGP("get_unique_tuple: Found current src map\n");
430 /* 2) Select the least-used IP/proto combination in the given
433 *tuple = *orig_tuple;
434 while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
436 DEBUGP("Found best for "); DUMP_TUPLE(tuple);
437 /* 3) The per-protocol part of the manip is made to
438 map into the range to make a unique tuple. */
440 /* Only bother mapping if it's not already in range
/* Already unique and already in range: accept as-is. */
442 if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
443 || proto->in_range(tuple, HOOK2MANIP(hooknum),
444 &rptr->min, &rptr->max))
445 && !ip_nat_used_tuple(tuple, conntrack)) {
/* Otherwise ask the protocol helper to find a unique proto part. */
449 if (proto->unique_tuple(tuple, rptr,
452 /* Must be unique. */
453 IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
457 } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
458 /* Try implicit source NAT; protocol
459 may be able to play with ports to
/* Synthetic single-IP range pinned to the current source address. */
461 struct ip_nat_range r
462 = { IP_NAT_RANGE_MAP_IPS,
463 tuple->src.ip, tuple->src.ip,
465 DEBUGP("Trying implicit mapping\n");
466 if (proto->unique_tuple(tuple, &r,
469 /* Must be unique. */
470 IP_NF_ASSERT(!ip_nat_used_tuple
476 DEBUGP("Protocol can't get unique tuple %u.\n",
480 /* Eliminate that from range, and try again. */
481 rptr->flags |= IP_NAT_RANGE_FULL;
482 *tuple = *orig_tuple;
488 /* Clear full flags. */
489 IP_NF_ASSERT(mr->rangesize >= 1);
490 for (i = 0; i < mr->rangesize; i++)
491 mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
/* LIST_FIND predicate: does `tuple`, masked by the helper's mask, match
 * the helper's registered tuple? */
497 helper_cmp(const struct ip_nat_helper *helper,
498 const struct ip_conntrack_tuple *tuple)
500 return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
503 /* Where to manip the reply packets (will be reverse manip). */
/* Maps each NAT hook to the hook where the reverse manip is applied:
 * PRE<->POST routing, and (with local NAT) LOCAL_OUT<->LOCAL_IN. */
504 static unsigned int opposite_hook[NF_IP_NUMHOOKS]
505 = { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
506 [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
507 #ifdef CONFIG_IP_NF_NAT_LOCAL
508 [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
509 [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
/* Set up NAT for a new connection: find a unique mapped tuple, alter the
 * conntrack's reply tuple to match, and record up to two manips per
 * changed direction (forward + reverse) in info->manips. Caller holds
 * the NAT write lock. Return-type and several brace/return lines are
 * missing from this listing. */
514 ip_nat_setup_info(struct ip_conntrack *conntrack,
515 const struct ip_nat_multi_range *mr,
516 unsigned int hooknum)
518 struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
519 struct ip_conntrack_tuple orig_tp;
520 struct ip_nat_info *info = &conntrack->nat.info;
522 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
523 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
524 || hooknum == NF_IP_POST_ROUTING
525 || hooknum == NF_IP_LOCAL_OUT);
526 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
528 /* What we've got will look like inverse of reply. Normally
529 this is what is in the conntrack, except for prior
530 manipulations (future optimization: if num_manips == 0,
532 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
533 invert_tuplepr(&orig_tp,
534 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
/* Debug dump of the requested range(s); compiled out unless DEBUGP is
 * enabled. */
540 DEBUGP("Hook %u (%s), ", hooknum,
541 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
542 DUMP_TUPLE(&orig_tp);
543 DEBUGP("Range %p: ", mr);
544 for (i = 0; i < mr->rangesize; i++) {
545 DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
547 (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
550 & IP_NAT_RANGE_PROTO_SPECIFIED)
551 ? " PROTO_SPECIFIED" : "",
552 (mr->range[i].flags & IP_NAT_RANGE_FULL)
554 NIPQUAD(mr->range[i].min_ip),
555 NIPQUAD(mr->range[i].max_ip),
556 mr->range[i].min.all,
557 mr->range[i].max.all);
/* Loop (a `do { ... } while` closed at line 593 below): pick a unique
 * tuple, then try to install the reply; repeat if we lose the race. */
563 if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
565 DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
571 DEBUGP("Hook %u (%s) %p\n", hooknum,
572 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
574 DEBUGP("Original: ");
575 DUMP_TUPLE(&orig_tp);
577 DUMP_TUPLE(&new_tuple);
580 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
581 the original (A/B/C/D') and the mangled one (E/F/G/H').
583 We're only allowed to work with the SRC per-proto
584 part, so we create inverses of both to start, then
585 derive the other fields we need. */
587 /* Reply connection: simply invert the new tuple
589 invert_tuplepr(&reply, &new_tuple);
591 /* Alter conntrack table so it recognizes replies.
592 If fail this race (reply tuple now used), repeat. */
593 } while (!ip_conntrack_alter_reply(conntrack, &reply));
595 /* FIXME: We can simply used existing conntrack reply tuple
597 /* Create inverse of original: C/D/A/B' */
598 invert_tuplepr(&inv_tuple, &orig_tp);
600 /* Has source changed?. */
601 if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
602 /* In this direction, a source manip. */
603 info->manips[info->num_manips++] =
604 ((struct ip_nat_info_manip)
605 { IP_CT_DIR_ORIGINAL, hooknum,
606 IP_NAT_MANIP_SRC, new_tuple.src });
608 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
610 /* In the reverse direction, a destination manip. */
611 info->manips[info->num_manips++] =
612 ((struct ip_nat_info_manip)
613 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
614 IP_NAT_MANIP_DST, orig_tp.src });
615 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
618 /* Has destination changed? */
619 if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
620 /* In this direction, a destination manip */
621 info->manips[info->num_manips++] =
622 ((struct ip_nat_info_manip)
623 { IP_CT_DIR_ORIGINAL, hooknum,
624 IP_NAT_MANIP_DST, reply.src });
626 IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
628 /* In the reverse direction, a source manip. */
629 info->manips[info->num_manips++] =
630 ((struct ip_nat_info_manip)
631 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
632 IP_NAT_MANIP_SRC, inv_tuple.src });
633 IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
636 /* If there's a helper, assign it; based on new tuple. */
/* Only master connections get a helper looked up here. */
637 if (!conntrack->master)
638 info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
/* Mark this manip direction (SRC or DST) as initialized. */
642 info->initialized |= (1 << HOOK2MANIP(hooknum));
/* Re-hash an already-placed conntrack after its tuples changed: unlink
 * both NAT hash entries and prepend them to their (new) chains.
 * NAT write lock must be held (asserted below). */
646 void replace_in_hashes(struct ip_conntrack *conntrack,
647 struct ip_nat_info *info)
649 /* Source has changed, so replace in hashes. */
/* NOTE(review): the `unsigned int srchash` declaration line is missing
 * from this listing; line 651 is its initializer. */
651 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
653 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
654 .tuple.dst.protonum)
655 /* We place packet as seen OUTGOUNG in byips_proto hash
656 (ie. reverse dst and src of reply packet. */
657 unsigned int ipsprotohash
658 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
660 conntrack->tuplehash[IP_CT_DIR_REPLY]
662 conntrack->tuplehash[IP_CT_DIR_REPLY]
663 .tuple.dst.protonum)
665 IP_NF_ASSERT(info->bysource.conntrack == conntrack);
666 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
668 list_del(&info->bysource.list);
669 list_del(&info->byipsproto.list);
671 list_prepend(&bysource[srchash], &info->bysource);
672 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
/* First-time insertion of a conntrack into both NAT hash tables
 * (bysource keyed on the ORIGINAL src manip, byipsproto keyed on the
 * reversed REPLY tuple). Write lock must be held; entries must not
 * already be placed (asserted below). */
675 void place_in_hashes(struct ip_conntrack *conntrack,
676 struct ip_nat_info *info)
/* NOTE(review): the `unsigned int srchash` declaration line is missing
 * from this listing; line 679 is its initializer. */
679 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
681 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
682 .tuple.dst.protonum)
683 /* We place packet as seen OUTGOUNG in byips_proto hash
684 (ie. reverse dst and src of reply packet. */
685 unsigned int ipsprotohash
686 = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
688 conntrack->tuplehash[IP_CT_DIR_REPLY]
690 conntrack->tuplehash[IP_CT_DIR_REPLY]
691 .tuple.dst.protonum)
693 IP_NF_ASSERT(!info->bysource.conntrack);
695 MUST_BE_WRITE_LOCKED(&ip_nat_lock);
696 info->byipsproto.conntrack = conntrack;
697 info->bysource.conntrack = conntrack;
699 list_prepend(&bysource[srchash], &info->bysource);
700 list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
703 /* Returns true if succeeded. */
/* Apply one manip to the packet: let the per-protocol handler mangle
 * the transport part, then rewrite saddr or daddr in the IP header with
 * an incremental checksum fixup via ip_nat_cheat_check(). The skb is
 * made writable first; iph is re-fetched after each potential
 * reallocation. */
705 manip_pkt(u_int16_t proto,
706 struct sk_buff **pskb,
707 unsigned int iphdroff,
708 const struct ip_conntrack_manip *manip,
709 enum ip_nat_manip_type maniptype)
713 (*pskb)->nfcache |= NFC_ALTERED;
714 if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
717 iph = (void *)(*pskb)->data + iphdroff;
719 /* Manipulate protcol part. */
720 if (!find_nat_proto(proto)->manip_pkt(pskb,
721 iphdroff + iph->ihl*4,
/* Re-fetch: manip_pkt above may have reallocated the skb data. */
725 iph = (void *)(*pskb)->data + iphdroff;
727 if (maniptype == IP_NAT_MANIP_SRC) {
728 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
730 iph->saddr = manip->ip;
732 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
734 iph->daddr = manip->ip;
/* Does this expectation apply to the packet? Delegates to the L4
 * protocol's exp_matches_pkt() when it provides one; the default-`ret`
 * declaration/return lines are missing from this listing.
 * Requires the conntrack read lock (asserted). */
739 static inline int exp_for_packet(struct ip_conntrack_expect *exp,
742 struct ip_conntrack_protocol *proto;
745 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
746 proto = __ip_ct_find_proto(skb->nh.iph->protocol);
747 if (proto->exp_matches_pkt)
748 ret = proto->exp_matches_pkt(exp, skb);
753 /* Do packet manipulations according to binding. */
/* Apply every recorded manip matching this packet's direction and hook,
 * then run the NAT helper (per matching expectation, or unconditionally
 * for ALWAYS helpers), and finally adjust TCP sequence numbers once per
 * packet. Return statements and some braces are missing from this
 * listing. */
755 do_bindings(struct ip_conntrack *ct,
756 enum ip_conntrack_info ctinfo,
757 struct ip_nat_info *info,
758 unsigned int hooknum,
759 struct sk_buff **pskb)
762 struct ip_nat_helper *helper;
763 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
764 int proto = (*pskb)->nh.iph->protocol;
766 /* Skip everything and don't call helpers if there are no
767 * manips for this connection */
768 if (info->num_manips == 0)
771 /* Need nat lock to protect against modification, but neither
772 conntrack (referenced) and helper (deleted with
773 synchronize_bh()) can vanish. */
774 READ_LOCK(&ip_nat_lock);
775 for (i = 0; i < info->num_manips; i++) {
/* Only manips recorded for this direction AND this hook apply here. */
776 if (info->manips[i].direction == dir
777 && info->manips[i].hooknum == hooknum) {
778 DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
780 info->manips[i].maniptype == IP_NAT_MANIP_SRC
782 NIPQUAD(info->manips[i].manip.ip),
783 htons(info->manips[i].manip.u.all));
784 if (!manip_pkt(proto, pskb, 0,
785 &info->manips[i].manip,
786 info->manips[i].maniptype)) {
787 READ_UNLOCK(&ip_nat_lock);
/* Snapshot the helper pointer before dropping the NAT lock. */
792 helper = info->helper;
793 READ_UNLOCK(&ip_nat_lock);
/* Helper path (the `if (helper)` line is missing from this listing). */
796 struct ip_conntrack_expect *exp = NULL;
797 struct list_head *cur_item;
799 int helper_called = 0;
801 DEBUGP("do_bindings: helper existing for (%p)\n", ct);
803 /* Always defragged for helpers */
804 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
805 & htons(IP_MF|IP_OFFSET)));
807 /* Have to grab read lock before sibling_list traversal */
808 READ_LOCK(&ip_conntrack_lock);
809 list_for_each(cur_item, &ct->sibling_list) {
810 exp = list_entry(cur_item, struct ip_conntrack_expect,
813 /* if this expectation is already established, skip */
817 if (exp_for_packet(exp, *pskb)) {
818 /* FIXME: May be true multiple times in the
820 DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
821 ret = helper->help(ct, exp, info, ctinfo,
823 if (ret != NF_ACCEPT) {
824 READ_UNLOCK(&ip_conntrack_lock);
830 /* Helper might want to manip the packet even when there is no
831 * matching expectation for this packet */
832 if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
833 DEBUGP("calling nat helper for packet without expectation\n");
834 ret = helper->help(ct, NULL, info, ctinfo,
836 if (ret != NF_ACCEPT) {
837 READ_UNLOCK(&ip_conntrack_lock);
841 READ_UNLOCK(&ip_conntrack_lock);
843 /* Adjust sequence number only once per packet
844 * (helper is called at all hooks) */
845 if (proto == IPPROTO_TCP
846 && (hooknum == NF_IP_POST_ROUTING
847 || hooknum == NF_IP_LOCAL_IN)) {
848 DEBUGP("ip_nat_core: adjusting sequence number\n");
849 /* future: put this in a l4-proto specific function,
850 * and call this function here. */
851 if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
/* Translate an ICMP error packet for a NAT'd connection: NAT the inner
 * (quoted) packet with each manip's type INVERTED (it was never
 * src/dst-reversed), NAT the outer IP header normally, then recompute
 * the ICMP checksum. Drops untranslated REDIRECTs. Return-type and
 * several lines are missing from this listing. */
864 icmp_reply_translation(struct sk_buff **pskb,
865 struct ip_conntrack *conntrack,
866 unsigned int hooknum,
874 struct ip_nat_info *info = &conntrack->nat.info;
877 if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
879 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
881 /* We're actually going to mangle it beyond trivial checksum
882 adjustment, so make sure the current checksum is correct. */
883 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
884 hdrlen = (*pskb)->nh.iph->ihl * 4;
/* A nonzero folded checksum over the ICMP payload means corruption. */
885 if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
886 (*pskb)->len - hdrlen, 0)))
890 /* Must be RELATED */
891 IP_NF_ASSERT((*pskb)->nfct
892 - (struct ip_conntrack *)(*pskb)->nfct->master
895 - (struct ip_conntrack *)(*pskb)->nfct->master
896 == IP_CT_RELATED+IP_CT_IS_REPLY);
898 /* Redirects on non-null nats must be dropped, else they'll
899 start talking to each other without our translation, and be
901 if (inside->icmp.type == ICMP_REDIRECT) {
902 /* Don't care about races here. */
903 if (info->initialized
904 != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
905 || info->num_manips != 0)
909 DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
910 *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
911 /* Note: May not be from a NAT'd host, but probably safest to
912 do translation always as if it came from the host itself
913 (even though a "host unreachable" coming from the host
914 itself is a bit weird).
916 More explanation: some people use NAT for anonymizing.
917 Also, CERT recommends dropping all packets from private IP
918 addresses (although ICMP errors from internal links with
919 such addresses are not too uncommon, as Alan Cox points
922 READ_LOCK(&ip_nat_lock);
923 for (i = 0; i < info->num_manips; i++) {
924 DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
925 i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
926 "ORIG" : "REPLY", info->manips[i].hooknum);
/* Only manips recorded for this packet's direction apply. */
928 if (info->manips[i].direction != dir)
931 /* Mapping the inner packet is just like a normal
932 packet, except it was never src/dst reversed, so
933 where we would normally apply a dst manip, we apply
934 a src, and vice versa. */
935 if (info->manips[i].hooknum == hooknum) {
936 DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
937 info->manips[i].maniptype == IP_NAT_MANIP_SRC
939 NIPQUAD(info->manips[i].manip.ip),
940 ntohs(info->manips[i].manip.u.udp.port));
/* `!maniptype` flips SRC<->DST for the quoted inner packet. */
941 if (!manip_pkt(inside->ip.protocol, pskb,
942 (*pskb)->nh.iph->ihl*4
943 + sizeof(inside->icmp),
944 &info->manips[i].manip,
945 !info->manips[i].maniptype))
948 /* Outer packet needs to have IP header NATed like
951 /* Use mapping to map outer packet: 0 give no
953 DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
954 info->manips[i].maniptype == IP_NAT_MANIP_SRC
956 NIPQUAD(info->manips[i].manip.ip));
/* proto 0: only the IP header is rewritten, no per-proto manip. */
957 if (!manip_pkt(0, pskb, 0,
958 &info->manips[i].manip,
959 info->manips[i].maniptype))
963 READ_UNLOCK(&ip_nat_lock);
965 hdrlen = (*pskb)->nh.iph->ihl * 4;
/* Re-fetch inside pointer: manip_pkt may have reallocated skb data. */
967 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
/* Recompute ICMP checksum from scratch over the mangled payload. */
969 inside->icmp.checksum = 0;
970 inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
971 (*pskb)->len - hdrlen,
/* Error path: unlock before bailing out (labels missing from listing). */
976 READ_UNLOCK(&ip_nat_lock);
/* Module init: size the NAT tables to match conntrack's, allocate both
 * hash arrays in one vmalloc, register the built-in TCP/UDP/ICMP NAT
 * protocols, and hook our conntrack destructor. The allocation-failure
 * return lines are missing from this listing. */
980 int __init ip_nat_init(void)
984 /* Leave them the same for the moment. */
985 ip_nat_htable_size = ip_conntrack_htable_size;
987 /* One vmalloc for both hash tables */
988 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
/* Second table lives in the upper half of the same allocation. */
992 byipsproto = bysource + ip_nat_htable_size;
994 /* Sew in builtin protocols. */
995 WRITE_LOCK(&ip_nat_lock);
996 list_append(&protos, &ip_nat_protocol_tcp);
997 list_append(&protos, &ip_nat_protocol_udp);
998 list_append(&protos, &ip_nat_protocol_icmp);
999 WRITE_UNLOCK(&ip_nat_lock);
1001 for (i = 0; i < ip_nat_htable_size; i++) {
1002 INIT_LIST_HEAD(&bysource[i]);
1003 INIT_LIST_HEAD(&byipsproto[i]);
1006 /* FIXME: Man, this is a hack. <SIGH> */
1007 IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
1008 ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
1013 /* Clear NAT section of all conntracks, in case we're loaded again. */
/* ip_ct_selective_cleanup callback: zero the per-conntrack NAT state.
 * The const is deliberately cast away to wipe i->nat in place. */
1014 static int clean_nat(const struct ip_conntrack *i, void *data)
1016 memset((void *)&i->nat, 0, sizeof(i->nat));
1020 /* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
/* Teardown: wipe NAT state from every conntrack and unhook the
 * destructor registered by ip_nat_init(). NOTE(review): the vfree of
 * the hash tables is not visible in this fragment — verify against the
 * full source. */
1021 void ip_nat_cleanup(void)
1023 ip_ct_selective_cleanup(&clean_nat, NULL);
1024 ip_conntrack_destroyed = NULL;