ef8641f7af8300efae329a3a74cb4325eba1761f
[linux-flexiantxendom0-3.2.10.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
1 /*
2  * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
3  *
4  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
5  *              Julian Anastasov <ja@ssi.bg>
6  *
7  *              This program is free software; you can redistribute it and/or
8  *              modify it under the terms of the GNU General Public License
9  *              as published by the Free Software Foundation; either version
10  *              2 of the License, or (at your option) any later version.
11  *
12  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
13  *
14  *              Network name space (netns) aware.
15  *              Global data moved to netns i.e struct netns_ipvs
16  *              tcp_timeouts table has copy per netns in a hash table per
17  *              protocol ip_vs_proto_data and is handled by netns
18  */
19
20 #define KMSG_COMPONENT "IPVS"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
23 #include <linux/kernel.h>
24 #include <linux/ip.h>
25 #include <linux/tcp.h>                  /* for tcphdr */
26 #include <net/ip.h>
27 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
28 #include <net/ip6_checksum.h>
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31
32 #include <net/ip_vs.h>
33
34 static int
35 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
36                   int *verdict, struct ip_vs_conn **cpp)
37 {
38         struct net *net;
39         struct ip_vs_service *svc;
40         struct tcphdr _tcph, *th;
41         struct ip_vs_iphdr iph;
42
43         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
44
45         th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
46         if (th == NULL) {
47                 *verdict = NF_DROP;
48                 return 0;
49         }
50         net = skb_net(skb);
51         /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
52         if (th->syn &&
53             (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
54                                      &iph.daddr, th->dest))) {
55                 int ignored;
56
57                 if (ip_vs_todrop(net_ipvs(net))) {
58                         /*
59                          * It seems that we are very loaded.
60                          * We have to drop this packet :(
61                          */
62                         ip_vs_service_put(svc);
63                         *verdict = NF_DROP;
64                         return 0;
65                 }
66
67                 /*
68                  * Let the virtual server select a real server for the
69                  * incoming connection, and create a connection entry.
70                  */
71                 *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
72                 if (!*cpp && ignored <= 0) {
73                         if (!ignored)
74                                 *verdict = ip_vs_leave(svc, skb, pd);
75                         else {
76                                 ip_vs_service_put(svc);
77                                 *verdict = NF_DROP;
78                         }
79                         return 0;
80                 }
81                 ip_vs_service_put(svc);
82         }
83         /* NF_ACCEPT */
84         return 1;
85 }
86
87
88 static inline void
89 tcp_fast_csum_update(int af, struct tcphdr *tcph,
90                      const union nf_inet_addr *oldip,
91                      const union nf_inet_addr *newip,
92                      __be16 oldport, __be16 newport)
93 {
94 #ifdef CONFIG_IP_VS_IPV6
95         if (af == AF_INET6)
96                 tcph->check =
97                         csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
98                                          ip_vs_check_diff2(oldport, newport,
99                                                 ~csum_unfold(tcph->check))));
100         else
101 #endif
102         tcph->check =
103                 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
104                                  ip_vs_check_diff2(oldport, newport,
105                                                 ~csum_unfold(tcph->check))));
106 }
107
108
109 static inline void
110 tcp_partial_csum_update(int af, struct tcphdr *tcph,
111                      const union nf_inet_addr *oldip,
112                      const union nf_inet_addr *newip,
113                      __be16 oldlen, __be16 newlen)
114 {
115 #ifdef CONFIG_IP_VS_IPV6
116         if (af == AF_INET6)
117                 tcph->check =
118                         ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
119                                          ip_vs_check_diff2(oldlen, newlen,
120                                                 csum_unfold(tcph->check))));
121         else
122 #endif
123         tcph->check =
124                 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
125                                 ip_vs_check_diff2(oldlen, newlen,
126                                                 csum_unfold(tcph->check))));
127 }
128
129
130 static int
131 tcp_snat_handler(struct sk_buff *skb,
132                  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
133 {
134         struct tcphdr *tcph;
135         unsigned int tcphoff;
136         int oldlen;
137         int payload_csum = 0;
138
139 #ifdef CONFIG_IP_VS_IPV6
140         if (cp->af == AF_INET6)
141                 tcphoff = sizeof(struct ipv6hdr);
142         else
143 #endif
144                 tcphoff = ip_hdrlen(skb);
145         oldlen = skb->len - tcphoff;
146
147         /* csum_check requires unshared skb */
148         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
149                 return 0;
150
151         if (unlikely(cp->app != NULL)) {
152                 int ret;
153
154                 /* Some checks before mangling */
155                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
156                         return 0;
157
158                 /* Call application helper if needed */
159                 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
160                         return 0;
161                 /* ret=2: csum update is needed after payload mangling */
162                 if (ret == 1)
163                         oldlen = skb->len - tcphoff;
164                 else
165                         payload_csum = 1;
166         }
167
168         tcph = (void *)skb_network_header(skb) + tcphoff;
169         tcph->source = cp->vport;
170
171         /* Adjust TCP checksums */
172         if (skb->ip_summed == CHECKSUM_PARTIAL) {
173                 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
174                                         htons(oldlen),
175                                         htons(skb->len - tcphoff));
176         } else if (!payload_csum) {
177                 /* Only port and addr are changed, do fast csum update */
178                 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
179                                      cp->dport, cp->vport);
180                 if (skb->ip_summed == CHECKSUM_COMPLETE)
181                         skb->ip_summed = (cp->app && pp->csum_check) ?
182                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
183         } else {
184                 /* full checksum calculation */
185                 tcph->check = 0;
186                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
187 #ifdef CONFIG_IP_VS_IPV6
188                 if (cp->af == AF_INET6)
189                         tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
190                                                       &cp->caddr.in6,
191                                                       skb->len - tcphoff,
192                                                       cp->protocol, skb->csum);
193                 else
194 #endif
195                         tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
196                                                         cp->caddr.ip,
197                                                         skb->len - tcphoff,
198                                                         cp->protocol,
199                                                         skb->csum);
200                 skb->ip_summed = CHECKSUM_UNNECESSARY;
201
202                 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
203                           pp->name, tcph->check,
204                           (char*)&(tcph->check) - (char*)tcph);
205         }
206         return 1;
207 }
208
209
210 static int
211 tcp_dnat_handler(struct sk_buff *skb,
212                  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
213 {
214         struct tcphdr *tcph;
215         unsigned int tcphoff;
216         int oldlen;
217         int payload_csum = 0;
218
219 #ifdef CONFIG_IP_VS_IPV6
220         if (cp->af == AF_INET6)
221                 tcphoff = sizeof(struct ipv6hdr);
222         else
223 #endif
224                 tcphoff = ip_hdrlen(skb);
225         oldlen = skb->len - tcphoff;
226
227         /* csum_check requires unshared skb */
228         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
229                 return 0;
230
231         if (unlikely(cp->app != NULL)) {
232                 int ret;
233
234                 /* Some checks before mangling */
235                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
236                         return 0;
237
238                 /*
239                  *      Attempt ip_vs_app call.
240                  *      It will fix ip_vs_conn and iph ack_seq stuff
241                  */
242                 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
243                         return 0;
244                 /* ret=2: csum update is needed after payload mangling */
245                 if (ret == 1)
246                         oldlen = skb->len - tcphoff;
247                 else
248                         payload_csum = 1;
249         }
250
251         tcph = (void *)skb_network_header(skb) + tcphoff;
252         tcph->dest = cp->dport;
253
254         /*
255          *      Adjust TCP checksums
256          */
257         if (skb->ip_summed == CHECKSUM_PARTIAL) {
258                 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
259                                         htons(oldlen),
260                                         htons(skb->len - tcphoff));
261         } else if (!payload_csum) {
262                 /* Only port and addr are changed, do fast csum update */
263                 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
264                                      cp->vport, cp->dport);
265                 if (skb->ip_summed == CHECKSUM_COMPLETE)
266                         skb->ip_summed = (cp->app && pp->csum_check) ?
267                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
268         } else {
269                 /* full checksum calculation */
270                 tcph->check = 0;
271                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
272 #ifdef CONFIG_IP_VS_IPV6
273                 if (cp->af == AF_INET6)
274                         tcph->check = csum_ipv6_magic(&cp->caddr.in6,
275                                                       &cp->daddr.in6,
276                                                       skb->len - tcphoff,
277                                                       cp->protocol, skb->csum);
278                 else
279 #endif
280                         tcph->check = csum_tcpudp_magic(cp->caddr.ip,
281                                                         cp->daddr.ip,
282                                                         skb->len - tcphoff,
283                                                         cp->protocol,
284                                                         skb->csum);
285                 skb->ip_summed = CHECKSUM_UNNECESSARY;
286         }
287         return 1;
288 }
289
290
291 static int
292 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
293 {
294         unsigned int tcphoff;
295
296 #ifdef CONFIG_IP_VS_IPV6
297         if (af == AF_INET6)
298                 tcphoff = sizeof(struct ipv6hdr);
299         else
300 #endif
301                 tcphoff = ip_hdrlen(skb);
302
303         switch (skb->ip_summed) {
304         case CHECKSUM_NONE:
305                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
306         case CHECKSUM_COMPLETE:
307 #ifdef CONFIG_IP_VS_IPV6
308                 if (af == AF_INET6) {
309                         if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
310                                             &ipv6_hdr(skb)->daddr,
311                                             skb->len - tcphoff,
312                                             ipv6_hdr(skb)->nexthdr,
313                                             skb->csum)) {
314                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
315                                                  "Failed checksum for");
316                                 return 0;
317                         }
318                 } else
319 #endif
320                         if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
321                                               ip_hdr(skb)->daddr,
322                                               skb->len - tcphoff,
323                                               ip_hdr(skb)->protocol,
324                                               skb->csum)) {
325                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
326                                                  "Failed checksum for");
327                                 return 0;
328                         }
329                 break;
330         default:
331                 /* No need to checksum. */
332                 break;
333         }
334
335         return 1;
336 }
337
338
339 #define TCP_DIR_INPUT           0
340 #define TCP_DIR_OUTPUT          4
341 #define TCP_DIR_INPUT_ONLY      8
342
343 static const int tcp_state_off[IP_VS_DIR_LAST] = {
344         [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
345         [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
346         [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
347 };
348
349 /*
350  *      Timeout table[state]
351  */
352 static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
353         [IP_VS_TCP_S_NONE]              =       2*HZ,
354         [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
355         [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
356         [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
357         [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
358         [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
359         [IP_VS_TCP_S_CLOSE]             =       10*HZ,
360         [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
361         [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
362         [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
363         [IP_VS_TCP_S_SYNACK]            =       120*HZ,
364         [IP_VS_TCP_S_LAST]              =       2*HZ,
365 };
366
367 static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
368         [IP_VS_TCP_S_NONE]              =       "NONE",
369         [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
370         [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
371         [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
372         [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
373         [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
374         [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
375         [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
376         [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
377         [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
378         [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
379         [IP_VS_TCP_S_LAST]              =       "BUG!",
380 };
381
382 #define sNO IP_VS_TCP_S_NONE
383 #define sES IP_VS_TCP_S_ESTABLISHED
384 #define sSS IP_VS_TCP_S_SYN_SENT
385 #define sSR IP_VS_TCP_S_SYN_RECV
386 #define sFW IP_VS_TCP_S_FIN_WAIT
387 #define sTW IP_VS_TCP_S_TIME_WAIT
388 #define sCL IP_VS_TCP_S_CLOSE
389 #define sCW IP_VS_TCP_S_CLOSE_WAIT
390 #define sLA IP_VS_TCP_S_LAST_ACK
391 #define sLI IP_VS_TCP_S_LISTEN
392 #define sSA IP_VS_TCP_S_SYNACK
393
394 struct tcp_states_t {
395         int next_state[IP_VS_TCP_S_LAST];
396 };
397
398 static const char * tcp_state_name(int state)
399 {
400         if (state >= IP_VS_TCP_S_LAST)
401                 return "ERR!";
402         return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
403 }
404
405 static struct tcp_states_t tcp_states [] = {
406 /*      INPUT */
407 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
408 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
409 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
410 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
411 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
412
413 /*      OUTPUT */
414 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
415 /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
416 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
417 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
418 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
419
420 /*      INPUT-ONLY */
421 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
422 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
423 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
424 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
425 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
426 };
427
428 static struct tcp_states_t tcp_states_dos [] = {
429 /*      INPUT */
430 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
431 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
432 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
433 /*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
434 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
435
436 /*      OUTPUT */
437 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
438 /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
439 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
440 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
441 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
442
443 /*      INPUT-ONLY */
444 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
445 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
446 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
447 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
448 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
449 };
450
451 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
452 {
453         int on = (flags & 1);           /* secure_tcp */
454
455         /*
456         ** FIXME: change secure_tcp to independent sysctl var
457         ** or make it per-service or per-app because it is valid
458         ** for most if not for all of the applications. Something
459         ** like "capabilities" (flags) for each object.
460         */
461         pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
462 }
463
464 static inline int tcp_state_idx(struct tcphdr *th)
465 {
466         if (th->rst)
467                 return 3;
468         if (th->syn)
469                 return 0;
470         if (th->fin)
471                 return 1;
472         if (th->ack)
473                 return 2;
474         return -1;
475 }
476
477 static inline void
478 set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
479               int direction, struct tcphdr *th)
480 {
481         int state_idx;
482         int new_state = IP_VS_TCP_S_CLOSE;
483         int state_off = tcp_state_off[direction];
484
485         /*
486          *    Update state offset to INPUT_ONLY if necessary
487          *    or delete NO_OUTPUT flag if output packet detected
488          */
489         if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
490                 if (state_off == TCP_DIR_OUTPUT)
491                         cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
492                 else
493                         state_off = TCP_DIR_INPUT_ONLY;
494         }
495
496         if ((state_idx = tcp_state_idx(th)) < 0) {
497                 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
498                 goto tcp_state_out;
499         }
500
501         new_state =
502                 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
503
504   tcp_state_out:
505         if (new_state != cp->state) {
506                 struct ip_vs_dest *dest = cp->dest;
507
508                 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
509                               "%s:%d state: %s->%s conn->refcnt:%d\n",
510                               pd->pp->name,
511                               ((state_off == TCP_DIR_OUTPUT) ?
512                                "output " : "input "),
513                               th->syn ? 'S' : '.',
514                               th->fin ? 'F' : '.',
515                               th->ack ? 'A' : '.',
516                               th->rst ? 'R' : '.',
517                               IP_VS_DBG_ADDR(cp->af, &cp->daddr),
518                               ntohs(cp->dport),
519                               IP_VS_DBG_ADDR(cp->af, &cp->caddr),
520                               ntohs(cp->cport),
521                               tcp_state_name(cp->state),
522                               tcp_state_name(new_state),
523                               atomic_read(&cp->refcnt));
524
525                 if (dest) {
526                         if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
527                             (new_state != IP_VS_TCP_S_ESTABLISHED)) {
528                                 atomic_dec(&dest->activeconns);
529                                 atomic_inc(&dest->inactconns);
530                                 cp->flags |= IP_VS_CONN_F_INACTIVE;
531                         } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
532                                    (new_state == IP_VS_TCP_S_ESTABLISHED)) {
533                                 atomic_inc(&dest->activeconns);
534                                 atomic_dec(&dest->inactconns);
535                                 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
536                         }
537                 }
538         }
539
540         if (likely(pd))
541                 cp->timeout = pd->timeout_table[cp->state = new_state];
542         else    /* What to do ? */
543                 cp->timeout = tcp_timeouts[cp->state = new_state];
544 }
545
546 /*
547  *      Handle state transitions
548  */
549 static void
550 tcp_state_transition(struct ip_vs_conn *cp, int direction,
551                      const struct sk_buff *skb,
552                      struct ip_vs_proto_data *pd)
553 {
554         struct tcphdr _tcph, *th;
555
556 #ifdef CONFIG_IP_VS_IPV6
557         int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
558 #else
559         int ihl = ip_hdrlen(skb);
560 #endif
561
562         th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
563         if (th == NULL)
564                 return;
565
566         spin_lock(&cp->lock);
567         set_tcp_state(pd, cp, direction, th);
568         spin_unlock(&cp->lock);
569 }
570
571 static inline __u16 tcp_app_hashkey(__be16 port)
572 {
573         return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
574                 & TCP_APP_TAB_MASK;
575 }
576
577
578 static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
579 {
580         struct ip_vs_app *i;
581         __u16 hash;
582         __be16 port = inc->port;
583         int ret = 0;
584         struct netns_ipvs *ipvs = net_ipvs(net);
585         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
586
587         hash = tcp_app_hashkey(port);
588
589         spin_lock_bh(&ipvs->tcp_app_lock);
590         list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
591                 if (i->port == port) {
592                         ret = -EEXIST;
593                         goto out;
594                 }
595         }
596         list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
597         atomic_inc(&pd->appcnt);
598
599   out:
600         spin_unlock_bh(&ipvs->tcp_app_lock);
601         return ret;
602 }
603
604
605 static void
606 tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
607 {
608         struct netns_ipvs *ipvs = net_ipvs(net);
609         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
610
611         spin_lock_bh(&ipvs->tcp_app_lock);
612         atomic_dec(&pd->appcnt);
613         list_del(&inc->p_list);
614         spin_unlock_bh(&ipvs->tcp_app_lock);
615 }
616
617
618 static int
619 tcp_app_conn_bind(struct ip_vs_conn *cp)
620 {
621         struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
622         int hash;
623         struct ip_vs_app *inc;
624         int result = 0;
625
626         /* Default binding: bind app only for NAT */
627         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
628                 return 0;
629
630         /* Lookup application incarnations and bind the right one */
631         hash = tcp_app_hashkey(cp->vport);
632
633         spin_lock(&ipvs->tcp_app_lock);
634         list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
635                 if (inc->port == cp->vport) {
636                         if (unlikely(!ip_vs_app_inc_get(inc)))
637                                 break;
638                         spin_unlock(&ipvs->tcp_app_lock);
639
640                         IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
641                                       "%s:%u to app %s on port %u\n",
642                                       __func__,
643                                       IP_VS_DBG_ADDR(cp->af, &cp->caddr),
644                                       ntohs(cp->cport),
645                                       IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
646                                       ntohs(cp->vport),
647                                       inc->name, ntohs(inc->port));
648
649                         cp->app = inc;
650                         if (inc->init_conn)
651                                 result = inc->init_conn(inc, cp);
652                         goto out;
653                 }
654         }
655         spin_unlock(&ipvs->tcp_app_lock);
656
657   out:
658         return result;
659 }
660
661
662 /*
663  *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
664  */
665 void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
666 {
667         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
668
669         spin_lock(&cp->lock);
670         cp->state = IP_VS_TCP_S_LISTEN;
671         cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
672                            : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
673         spin_unlock(&cp->lock);
674 }
675
676 /* ---------------------------------------------
677  *   timeouts is netns related now.
678  * ---------------------------------------------
679  */
680 static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
681 {
682         struct netns_ipvs *ipvs = net_ipvs(net);
683
684         ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
685         spin_lock_init(&ipvs->tcp_app_lock);
686         pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
687                                                         sizeof(tcp_timeouts));
688         pd->tcp_state_table =  tcp_states;
689 }
690
691 static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
692 {
693         kfree(pd->timeout_table);
694 }
695
696
697 struct ip_vs_protocol ip_vs_protocol_tcp = {
698         .name =                 "TCP",
699         .protocol =             IPPROTO_TCP,
700         .num_states =           IP_VS_TCP_S_LAST,
701         .dont_defrag =          0,
702         .init =                 NULL,
703         .exit =                 NULL,
704         .init_netns =           __ip_vs_tcp_init,
705         .exit_netns =           __ip_vs_tcp_exit,
706         .register_app =         tcp_register_app,
707         .unregister_app =       tcp_unregister_app,
708         .conn_schedule =        tcp_conn_schedule,
709         .conn_in_get =          ip_vs_conn_in_get_proto,
710         .conn_out_get =         ip_vs_conn_out_get_proto,
711         .snat_handler =         tcp_snat_handler,
712         .dnat_handler =         tcp_dnat_handler,
713         .csum_check =           tcp_csum_check,
714         .state_name =           tcp_state_name,
715         .state_transition =     tcp_state_transition,
716         .app_conn_bind =        tcp_app_conn_bind,
717         .debug_packet =         ip_vs_tcpudp_debug_packet,
718         .timeout_change =       tcp_timeout_change,
719 };