2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
23 #include <linux/config.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/init.h>
27 #include <linux/types.h>
28 #include <linux/errno.h>
30 #include <linux/sysctl.h>
31 #include <linux/proc_fs.h>
32 #include <linux/timer.h>
33 #include <linux/swap.h>
34 #include <linux/proc_fs.h>
35 #include <linux/seq_file.h>
37 #include <linux/netfilter.h>
38 #include <linux/netfilter_ipv4.h>
43 #include <asm/uaccess.h>
45 #include <net/ip_vs.h>
/*
 * Module-global locks, defense-mode state and sysctl tunables for the
 * IPVS control plane.  Symbols declared without `static` are shared
 * with other ip_vs compilation units.
 * NOTE(review): this is a non-contiguous listing; the number at the
 * start of each line is the original file's line number.
 */
47 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
48 static DECLARE_MUTEX(__ip_vs_mutex);
50 /* lock for service table */
51 static rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED;
53 /* lock for table with the real services */
54 static rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED;
56 /* lock for state and timeout tables */
57 static rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED;
59 /* lock for drop entry handling */
60 static spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED;
62 /* lock for drop packet handling */
63 static spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED;
65 /* 1/rate drop and drop-entry variables */
66 int ip_vs_drop_rate = 0;
67 int ip_vs_drop_counter = 0;
68 atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
70 /* number of virtual services */
71 static int ip_vs_num_services = 0;
73 /* sysctl variables */
74 static int sysctl_ip_vs_drop_entry = 0;
75 static int sysctl_ip_vs_drop_packet = 0;
76 static int sysctl_ip_vs_secure_tcp = 0;
/* amemthresh is in pages; compared against free+buffered memory below */
77 static int sysctl_ip_vs_amemthresh = 1024;
78 static int sysctl_ip_vs_am_droprate = 10;
79 int sysctl_ip_vs_cache_bypass = 0;
80 int sysctl_ip_vs_expire_nodest_conn = 0;
81 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
82 int sysctl_ip_vs_nat_icmp_send = 0;
/*
 * Debug level accessor, compiled only with CONFIG_IP_VS_DEBUG.
 * Returns the current sysctl-controlled debug level.
 * NOTE(review): braces omitted in this excerpt of the listing.
 */
85 #ifdef CONFIG_IP_VS_DEBUG
86 static int sysctl_ip_vs_debug_level = 0;
88 int ip_vs_get_debug_level(void)
90 return sysctl_ip_vs_debug_level;
/*
 * Recompute the three DoS-defense strategies (drop_entry, drop_packet,
 * secure_tcp) based on available memory vs. sysctl_ip_vs_amemthresh.
 * Each strategy's state is updated under its own lock.
 * NOTE(review): several case labels and braces are omitted from this
 * excerpt; the visible transitions suggest mode 1 = enable-on-nomem,
 * mode 2 = sticky-enabled, mode 3 = always on — confirm against the
 * full source.
 */
95 * update_defense_level is called from timer bh and from sysctl.
97 static void update_defense_level(void)
100 static int old_secure_tcp = 0;
105 /* we only count free and buffered memory (in pages) */
107 availmem = i.freeram + i.bufferram;
108 /* however in linux 2.5 the i.bufferram is total page cache size,
110 /* si_swapinfo(&i); */
111 /* availmem = availmem - (i.totalswap - i.freeswap); */
113 nomem = (availmem < sysctl_ip_vs_amemthresh);
/* drop_entry defense: randomly expire connection entries under pressure */
116 spin_lock(&__ip_vs_dropentry_lock);
117 switch (sysctl_ip_vs_drop_entry) {
119 atomic_set(&ip_vs_dropentry, 0);
123 atomic_set(&ip_vs_dropentry, 1);
124 sysctl_ip_vs_drop_entry = 2;
126 atomic_set(&ip_vs_dropentry, 0);
131 atomic_set(&ip_vs_dropentry, 1);
133 atomic_set(&ip_vs_dropentry, 0);
134 sysctl_ip_vs_drop_entry = 1;
138 atomic_set(&ip_vs_dropentry, 1);
141 spin_unlock(&__ip_vs_dropentry_lock);
/* drop_packet defense: drop 1 of every ip_vs_drop_rate packets */
144 spin_lock(&__ip_vs_droppacket_lock);
145 switch (sysctl_ip_vs_drop_packet) {
151 ip_vs_drop_rate = ip_vs_drop_counter
152 = sysctl_ip_vs_amemthresh /
153 (sysctl_ip_vs_amemthresh-availmem);
154 sysctl_ip_vs_drop_packet = 2;
161 ip_vs_drop_rate = ip_vs_drop_counter
162 = sysctl_ip_vs_amemthresh /
163 (sysctl_ip_vs_amemthresh-availmem);
166 sysctl_ip_vs_drop_packet = 1;
170 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
173 spin_unlock(&__ip_vs_droppacket_lock);
/* secure_tcp defense: switch protocol state/timeout tables */
176 write_lock(&__ip_vs_securetcp_lock);
177 switch (sysctl_ip_vs_secure_tcp) {
179 if (old_secure_tcp >= 2)
184 if (old_secure_tcp < 2)
186 sysctl_ip_vs_secure_tcp = 2;
188 if (old_secure_tcp >= 2)
194 if (old_secure_tcp < 2)
197 if (old_secure_tcp >= 2)
199 sysctl_ip_vs_secure_tcp = 1;
203 if (old_secure_tcp < 2)
207 old_secure_tcp = sysctl_ip_vs_secure_tcp;
209 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
210 write_unlock(&__ip_vs_securetcp_lock);
/*
 * Periodic (1*HZ) defense timer: re-evaluates the defense level and,
 * when drop_entry is active, randomly drops connection entries; then
 * re-arms itself.  Below it: module use-count helpers that wrap
 * try_module_get()/module_put() for THIS_MODULE.
 */
215 * Timer for checking the defense
217 static struct timer_list defense_timer;
218 #define DEFENSE_TIMER_PERIOD 1*HZ
220 static void defense_timer_handler(unsigned long data)
222 update_defense_level();
223 if (atomic_read(&ip_vs_dropentry))
224 ip_vs_random_dropentry();
226 mod_timer(&defense_timer, jiffies + DEFENSE_TIMER_PERIOD);
/* increase the module use count; returns the try_module_get() result */
231 ip_vs_use_count_inc(void)
233 return try_module_get(THIS_MODULE);
/* decrease the module use count */
237 ip_vs_use_count_dec(void)
239 module_put(THIS_MODULE);
/*
 * Hash tables and bookkeeping for services and destinations:
 *  - ip_vs_svc_table:     256 buckets keyed by <protocol, addr, port>
 *  - ip_vs_svc_fwm_table: 256 buckets keyed by firewall mark
 *  - ip_vs_rtable:        16 buckets for real services
 *  - ip_vs_dest_trash:    removed dests still referenced by connections
 *  - ftpsvc/nullsvc counters: enable the FTP / port-zero fallback
 *    lookups in ip_vs_service_get().
 */
244 * Hash table: for virtual service lookups
246 #define IP_VS_SVC_TAB_BITS 8
247 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
248 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
250 /* the service table hashed by <protocol, addr, port> */
251 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
252 /* the service table hashed by fwmark */
253 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
256 * Hash table: for real service lookups
258 #define IP_VS_RTAB_BITS 4
259 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
260 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
262 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
265 * Trash for destinations
267 static LIST_HEAD(ip_vs_dest_trash);
270 * FTP & NULL virtual service counters
272 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
273 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
/*
 * Bucket-index helpers for the two service tables.
 * ip_vs_svc_hashkey: mixes protocol, host-order address and port
 * (port folded at IP_VS_SVC_TAB_BITS) then masks to table size.
 * ip_vs_svc_fwm_hashkey: fwmark masked directly to table size.
 */
277 * Returns hash value for virtual service
279 static __inline__ unsigned
280 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
282 register unsigned porth = ntohs(port);
284 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
285 & IP_VS_SVC_TAB_MASK;
289 * Returns hash value of fwmark for virtual service lookup
291 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
293 return fwmark & IP_VS_SVC_TAB_MASK;
/*
 * Insert/remove a service in its hash table.  A service lives either
 * in ip_vs_svc_table (fwmark == 0, keyed by <proto,addr,port>) or in
 * ip_vs_svc_fwm_table (keyed by fwmark).  IP_VS_SVC_F_HASHED tracks
 * membership; refcnt is raised while the table holds the service.
 * Callers must hold the service-table lock.
 */
297 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
298 * or in the ip_vs_svc_fwm_table by fwmark.
299 * Should be called with locked tables.
301 static int ip_vs_svc_hash(struct ip_vs_service *svc)
305 if (svc->flags & IP_VS_SVC_F_HASHED) {
306 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
307 "called from %p\n", __builtin_return_address(0));
311 if (svc->fwmark == 0) {
313 * Hash it by <protocol,addr,port> in ip_vs_svc_table
315 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
316 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
319 * Hash it by fwmark in ip_vs_svc_fwm_table
321 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
322 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
325 svc->flags |= IP_VS_SVC_F_HASHED;
326 /* increase its refcnt because it is referenced by the svc table */
327 atomic_inc(&svc->refcnt);
333 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
334 * Should be called with locked tables.
336 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
338 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
339 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
340 "called from %p\n", __builtin_return_address(0));
344 if (svc->fwmark == 0) {
345 /* Remove it from the ip_vs_svc_table table */
346 list_del(&svc->s_list);
348 /* Remove it from the ip_vs_svc_fwm_table table */
349 list_del(&svc->f_list);
352 svc->flags &= ~IP_VS_SVC_F_HASHED;
353 atomic_dec(&svc->refcnt);
/*
 * Service lookup.  The two __-helpers walk one hash bucket and bump
 * svc->usecnt on a hit.  ip_vs_service_get() combines them under the
 * read lock, with fallbacks: fwmark table first, then exact
 * <proto,addr,port>, then the FTP control port (for FTP data
 * connections, gated on ip_vs_ftpsvc_counter), then the catch-all
 * port-zero service (gated on ip_vs_nullsvc_counter).
 */
359 * Get service by {proto,addr,port} in the service table.
361 static __inline__ struct ip_vs_service *
362 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
365 struct ip_vs_service *svc;
367 /* Check for "full" addressed entries */
368 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
370 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
371 if ((svc->addr == vaddr)
372 && (svc->port == vport)
373 && (svc->protocol == protocol)) {
375 atomic_inc(&svc->usecnt);
385 * Get service by {fwmark} in the service table.
387 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
390 struct ip_vs_service *svc;
392 /* Check for fwmark addressed entries */
393 hash = ip_vs_svc_fwm_hashkey(fwmark);
395 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
396 if (svc->fwmark == fwmark) {
398 atomic_inc(&svc->usecnt);
406 struct ip_vs_service *
407 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
409 struct ip_vs_service *svc;
411 read_lock(&__ip_vs_svc_lock);
414 * Check the table hashed by fwmark first
416 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
420 * Check the table hashed by <protocol,addr,port>
421 * for "full" addressed entries
423 svc = __ip_vs_service_get(protocol, vaddr, vport);
426 && protocol == IPPROTO_TCP
427 && atomic_read(&ip_vs_ftpsvc_counter)
428 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
430 * Check if ftp service entry exists, the packet
431 * might belong to FTP data connections.
433 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
437 && atomic_read(&ip_vs_nullsvc_counter)) {
439 * Check if the catch-all port (port zero) exists
441 svc = __ip_vs_service_get(protocol, vaddr, 0);
445 read_unlock(&__ip_vs_svc_lock);
447 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
448 fwmark, ip_vs_proto_name(protocol),
449 NIPQUAD(vaddr), ntohs(vport),
450 svc?"hit":"not hit");
/*
 * Destination <-> service binding and real-service hashing.
 * __ip_vs_bind_svc / __ip_vs_unbind_svc manage svc->refcnt from a
 * dest's point of view.  ip_vs_rs_hash(key) places a dest in
 * ip_vs_rtable keyed by <addr,port>; an empty d_list means "not
 * hashed", so hash is skipped when already present and unhash
 * re-initializes d_list after list_del.  Table lock must be held.
 */
457 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
459 atomic_inc(&svc->refcnt);
464 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
466 struct ip_vs_service *svc = dest->svc;
/* free the service when the last reference is dropped */
469 if (atomic_dec_and_test(&svc->refcnt))
475 * Returns hash value for real service
477 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
479 register unsigned porth = ntohs(port);
481 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
486 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
487 * should be called with locked tables.
489 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
493 if (!list_empty(&dest->d_list)) {
498 * Hash by proto,addr,port,
499 * which are the parameters of the real service.
501 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
502 list_add(&dest->d_list, &ip_vs_rtable[hash]);
508 * UNhashes ip_vs_dest from ip_vs_rtable.
509 * should be called with locked tables.
511 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
514 * Remove it from the ip_vs_rtable table.
516 if (!list_empty(&dest->d_list)) {
517 list_del(&dest->d_list);
518 INIT_LIST_HEAD(&dest->d_list);
/*
 * ip_vs_lookup_real_service: find the first dest in ip_vs_rtable
 * matching <proto,addr,port>, under the rs read lock.
 * ip_vs_lookup_dest: linear scan of one service's destination list
 * for <addr,port>; caller is expected to hold the service lock.
 */
525 * Lookup real service by <proto,addr,port> in the real service table.
528 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
531 struct ip_vs_dest *dest;
534 * Check for "full" addressed entries
535 * Return the first found entry
537 hash = ip_vs_rs_hashkey(daddr, dport);
539 read_lock(&__ip_vs_rs_lock);
540 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
541 if ((dest->addr == daddr)
542 && (dest->port == dport)
543 && ((dest->protocol == protocol) ||
546 read_unlock(&__ip_vs_rs_lock);
550 read_unlock(&__ip_vs_rs_lock);
556 * Lookup destination by {addr,port} in the given service
558 static struct ip_vs_dest *
559 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
561 struct ip_vs_dest *dest;
564 * Find the destination for the given service
566 list_for_each_entry(dest, &svc->destinations, n_list) {
567 if ((dest->addr == daddr) && (dest->port == dport)) {
/*
 * Trash handling.  ip_vs_trash_get_dest() searches the trash for a
 * dest matching the given service and <addr,port>; while scanning it
 * opportunistically frees trash entries whose refcnt has dropped to 1
 * (only the trash itself holds them).  ip_vs_trash_cleanup() empties
 * the trash at module exit, when refcnts are known to be 1.
 */
578 * Lookup dest by {svc,addr,port} in the destination trash.
579 * The destination trash is used to hold the destinations that are removed
580 * from the service table but are still referenced by some conn entries.
581 * The reason to add the destination trash is when the dest is temporary
582 * down (either by administrator or by monitor program), the dest can be
583 * picked back from the trash, the remaining connections to the dest can
584 * continue, and the counting information of the dest is also useful for
587 static struct ip_vs_dest *
588 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
590 struct ip_vs_dest *dest, *nxt;
593 * Find the destination in trash
595 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
596 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
599 NIPQUAD(dest->addr), ntohs(dest->port),
600 atomic_read(&dest->refcnt));
601 if (dest->addr == daddr &&
602 dest->port == dport &&
603 dest->vfwmark == svc->fwmark &&
604 dest->protocol == svc->protocol &&
606 (dest->vaddr == svc->addr &&
607 dest->vport == svc->port))) {
613 * Try to purge the destination from trash if not referenced
615 if (atomic_read(&dest->refcnt) == 1) {
616 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
619 NIPQUAD(dest->addr), ntohs(dest->port));
620 list_del(&dest->n_list);
621 ip_vs_dst_reset(dest);
622 __ip_vs_unbind_svc(dest);
632 * Clean up all the destinations in the trash
633 * Called by the ip_vs_control_cleanup()
635 * When the ip_vs_control_cleanup is activated by ipvs module exit,
636 * the service tables must have been flushed and all the connections
637 * are expired, and the refcnt of each destination in the trash must
638 * be 1, so we simply release them here.
640 static void ip_vs_trash_cleanup(void)
642 struct ip_vs_dest *dest, *nxt;
644 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
645 list_del(&dest->n_list);
646 ip_vs_dst_reset(dest);
647 __ip_vs_unbind_svc(dest);
/*
 * Zero an ip_vs_stats block under its own lock.  The memset clears
 * only the counters laid out before the embedded lock field, then the
 * estimator state is reset as well.
 */
654 ip_vs_zero_stats(struct ip_vs_stats *stats)
656 spin_lock_bh(&stats->lock);
657 memset(stats, 0, (char *)&stats->lock - (char *)stats);
658 spin_unlock_bh(&stats->lock);
659 ip_vs_zero_estimator(stats);
/*
 * Apply user-supplied parameters (udest) to a destination: weight,
 * connection flags (forced LOCALNODE for local addresses, NOOUTPUT
 * for non-NAT forwarding), real-service hashing, service binding
 * (rebinding zeroes the stats), availability flag and thresholds.
 * Raising or clearing u_threshold clears the OVERLOAD flag.
 */
663 * Update a destination in the given service
666 __ip_vs_update_dest(struct ip_vs_service *svc,
667 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
671 /* set the weight and the flags */
672 atomic_set(&dest->weight, udest->weight);
673 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
675 /* check if local node and update the flags */
676 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
677 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
678 | IP_VS_CONN_F_LOCALNODE;
681 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
682 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
683 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
686 * Put the real service in ip_vs_rtable if not present.
687 * For now only for NAT!
689 write_lock_bh(&__ip_vs_rs_lock);
691 write_unlock_bh(&__ip_vs_rs_lock);
693 atomic_set(&dest->conn_flags, conn_flags);
695 /* bind the service */
697 __ip_vs_bind_svc(dest, svc);
/* rebinding to a different service: drop old binding, reset stats */
699 if (dest->svc != svc) {
700 __ip_vs_unbind_svc(dest);
701 ip_vs_zero_stats(&dest->stats);
702 __ip_vs_bind_svc(dest, svc);
706 /* set the dest status flags */
707 dest->flags |= IP_VS_DEST_F_AVAILABLE;
709 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
710 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
711 dest->u_threshold = udest->u_threshold;
712 dest->l_threshold = udest->l_threshold;
/*
 * Allocate and initialize a new destination for a service (address
 * must be local or unicast), copy identity from svc/udest, zero the
 * counters, then apply the user parameters via __ip_vs_update_dest()
 * and register a rate estimator.  Result returned through *dest_p.
 * GFP_ATOMIC is used because the caller holds locks / runs in a
 * context that must not sleep — confirm against the full source.
 */
717 * Create a destination for the given service
720 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
721 struct ip_vs_dest **dest_p)
723 struct ip_vs_dest *dest;
728 atype = inet_addr_type(udest->addr);
729 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
732 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
734 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
737 memset(dest, 0, sizeof(struct ip_vs_dest));
739 dest->protocol = svc->protocol;
740 dest->vaddr = svc->addr;
741 dest->vport = svc->port;
742 dest->vfwmark = svc->fwmark;
743 dest->addr = udest->addr;
744 dest->port = udest->port;
746 atomic_set(&dest->activeconns, 0);
747 atomic_set(&dest->inactconns, 0);
748 atomic_set(&dest->persistconns, 0);
749 atomic_set(&dest->refcnt, 0);
751 INIT_LIST_HEAD(&dest->d_list);
752 dest->dst_lock = SPIN_LOCK_UNLOCKED;
753 dest->stats.lock = SPIN_LOCK_UNLOCKED;
754 __ip_vs_update_dest(svc, dest, udest);
755 ip_vs_new_estimator(&dest->stats);
/*
 * Add a destination to an existing service.  Validates weight and
 * thresholds, rejects duplicates, and prefers reviving a matching
 * dest from the trash (re-applying user params and re-registering the
 * estimator) over allocating a new one.  In both paths the dest is
 * linked into svc->destinations under the service write lock, after
 * waiting for all other svc users (usecnt > 1) to go away, and the
 * scheduler is notified via update_service().
 */
765 * Add a destination into an existing service
768 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
770 struct ip_vs_dest *dest;
771 __u32 daddr = udest->addr;
772 __u16 dport = udest->port;
777 if (udest->weight < 0) {
778 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
782 if (udest->l_threshold > udest->u_threshold) {
783 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
784 "upper threshold\n");
789 * Check if the dest already exists in the list
791 dest = ip_vs_lookup_dest(svc, daddr, dport);
793 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
798 * Check if the dest already exists in the trash and
799 * is from the same service
801 dest = ip_vs_trash_get_dest(svc, daddr, dport);
803 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
804 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
805 NIPQUAD(daddr), ntohs(dport),
806 atomic_read(&dest->refcnt),
808 NIPQUAD(dest->vaddr),
810 __ip_vs_update_dest(svc, dest, udest);
813 * Get the destination from the trash
815 list_del(&dest->n_list);
817 ip_vs_new_estimator(&dest->stats);
819 write_lock_bh(&__ip_vs_svc_lock);
822 * Wait until all other svc users go away.
824 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
826 list_add(&dest->n_list, &svc->destinations);
829 /* call the update_service function of its scheduler */
830 svc->scheduler->update_service(svc);
832 write_unlock_bh(&__ip_vs_svc_lock);
837 * Allocate and initialize the dest structure
839 ret = ip_vs_new_dest(svc, udest, &dest);
845 * Add the dest entry into the list
/* table reference: keep the dest alive while linked to the service */
847 atomic_inc(&dest->refcnt);
849 write_lock_bh(&__ip_vs_svc_lock);
852 * Wait until all other svc users go away.
854 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
856 list_add(&dest->n_list, &svc->destinations);
859 /* call the update_service function of its scheduler */
860 svc->scheduler->update_service(svc);
862 write_unlock_bh(&__ip_vs_svc_lock);
/*
 * Edit an existing destination: same weight/threshold validation as
 * ip_vs_add_dest(), then __ip_vs_update_dest() plus a scheduler
 * update_service() notification (weights may have changed).
 * Fails when the dest is not found in the service.
 */
871 * Edit a destination in the given service
874 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
876 struct ip_vs_dest *dest;
877 __u32 daddr = udest->addr;
878 __u16 dport = udest->port;
882 if (udest->weight < 0) {
883 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
887 if (udest->l_threshold > udest->u_threshold) {
888 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
889 "upper threshold\n");
894 * Lookup the destination list
896 dest = ip_vs_lookup_dest(svc, daddr, dport);
898 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
902 __ip_vs_update_dest(svc, dest, udest);
904 /* call the update_service, because server weight may be changed */
905 svc->scheduler->update_service(svc);
/*
 * Destination removal, in three layers:
 *  __ip_vs_del_dest:    kill the estimator, unhash from ip_vs_rtable,
 *                       then either free the dest (refcnt reached 0,
 *                       dropping the svc refcnt it held) or park it in
 *                       the trash with an extra reference.
 *  __ip_vs_unlink_dest: clear AVAILABLE, unlink from the service's
 *                       list and notify the scheduler.
 *  ip_vs_del_dest:      user-facing entry point — look up the dest,
 *                       unlink it under the service write lock (after
 *                       waiting out other svc users), then delete it.
 */
914 * Delete a destination (must be already unlinked from the service)
916 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
918 ip_vs_kill_estimator(&dest->stats);
921 * Remove it from the d-linked list with the real services.
923 write_lock_bh(&__ip_vs_rs_lock);
924 ip_vs_rs_unhash(dest);
925 write_unlock_bh(&__ip_vs_rs_lock);
928 * Decrease the refcnt of the dest, and free the dest
929 * if nobody refers to it (refcnt=0). Otherwise, throw
930 * the destination into the trash.
932 if (atomic_dec_and_test(&dest->refcnt)) {
933 ip_vs_dst_reset(dest);
934 /* simply decrease svc->refcnt here, let the caller check
935 and release the service if nobody refers to it.
936 Only user context can release destination and service,
937 and only one user context can update virtual service at a
938 time, so the operation here is OK */
939 atomic_dec(&dest->svc->refcnt);
942 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
943 NIPQUAD(dest->addr), ntohs(dest->port),
944 atomic_read(&dest->refcnt));
945 list_add(&dest->n_list, &ip_vs_dest_trash);
946 atomic_inc(&dest->refcnt);
952 * Unlink a destination from the given service
954 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
955 struct ip_vs_dest *dest,
958 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
961 * Remove it from the d-linked destination list.
963 list_del(&dest->n_list);
967 * Call the update_service function of its scheduler
969 svc->scheduler->update_service(svc);
975 * Delete a destination server in the given service
978 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
980 struct ip_vs_dest *dest;
981 __u32 daddr = udest->addr;
982 __u16 dport = udest->port;
986 dest = ip_vs_lookup_dest(svc, daddr, dport);
988 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
992 write_lock_bh(&__ip_vs_svc_lock);
995 * Wait until all other svc users go away.
997 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1000 * Unlink dest from the service
1002 __ip_vs_unlink_dest(svc, dest, 1);
1004 write_unlock_bh(&__ip_vs_svc_lock);
1007 * Delete the destination
1009 __ip_vs_del_dest(dest);
/*
 * Create a new virtual service from user parameters: pin the module,
 * resolve the scheduler by name, allocate and populate the service
 * (usecnt starts at 1 — the creating user), bind the scheduler,
 * bump the FTP/null-service counters, register the estimator and
 * hash it into the service table.  The tail lines (1090-1102) are
 * the error-unwind path: unbind scheduler, drop app inc, put the
 * scheduler and release the module use count.
 */
1018 * Add a service into the service hash table
1021 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1024 struct ip_vs_scheduler *sched = NULL;
1025 struct ip_vs_service *svc = NULL;
1027 /* increase the module use count */
1028 ip_vs_use_count_inc();
1030 /* Lookup the scheduler by 'u->sched_name' */
1031 sched = ip_vs_scheduler_get(u->sched_name);
1032 if (sched == NULL) {
1033 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1039 svc = (struct ip_vs_service *)
1040 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1042 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1046 memset(svc, 0, sizeof(struct ip_vs_service));
1048 /* I'm the first user of the service */
1049 atomic_set(&svc->usecnt, 1);
1050 atomic_set(&svc->refcnt, 0);
1052 svc->protocol = u->protocol;
1053 svc->addr = u->addr;
1054 svc->port = u->port;
1055 svc->fwmark = u->fwmark;
1056 svc->flags = u->flags;
1057 svc->timeout = u->timeout * HZ;
1058 svc->netmask = u->netmask;
1060 INIT_LIST_HEAD(&svc->destinations);
1061 svc->sched_lock = RW_LOCK_UNLOCKED;
1062 svc->stats.lock = SPIN_LOCK_UNLOCKED;
1064 /* Bind the scheduler */
1065 ret = ip_vs_bind_scheduler(svc, sched);
1070 /* Update the virtual service counters */
1071 if (svc->port == FTPPORT)
1072 atomic_inc(&ip_vs_ftpsvc_counter);
1073 else if (svc->port == 0)
1074 atomic_inc(&ip_vs_nullsvc_counter);
1076 ip_vs_new_estimator(&svc->stats);
1077 ip_vs_num_services++;
1079 /* Hash the service into the service table */
1080 write_lock_bh(&__ip_vs_svc_lock);
1081 ip_vs_svc_hash(svc);
1082 write_unlock_bh(&__ip_vs_svc_lock);
/* error unwind labels follow (lines omitted from this excerpt) */
1090 ip_vs_unbind_scheduler(svc);
1093 ip_vs_app_inc_put(svc->inc);
1098 ip_vs_scheduler_put(sched);
1101 /* decrease the module use count */
1102 ip_vs_use_count_dec();
/*
 * Edit a service in place: resolve the (possibly new) scheduler,
 * quiesce other svc users under the write lock, update flags/timeout/
 * netmask, and if the scheduler changed, unbind the old one and bind
 * the new one — restoring the old scheduler if binding fails (which
 * itself may fail; see the TODO in the original comment).
 */
1109 * Edit a service and bind it with a new scheduler
1112 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1114 struct ip_vs_scheduler *sched, *old_sched;
1118 * Lookup the scheduler, by 'u->sched_name'
1120 sched = ip_vs_scheduler_get(u->sched_name);
1121 if (sched == NULL) {
1122 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1128 write_lock_bh(&__ip_vs_svc_lock);
1131 * Wait until all other svc users go away.
1133 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1136 * Set the flags and timeout value
1138 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1139 svc->timeout = u->timeout * HZ;
1140 svc->netmask = u->netmask;
1142 old_sched = svc->scheduler;
1143 if (sched != old_sched) {
1145 * Unbind the old scheduler
1147 if ((ret = ip_vs_unbind_scheduler(svc))) {
1153 * Bind the new scheduler
1155 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1157 * If ip_vs_bind_scheduler fails, restore the old
1159 * The main reason of failure is out of memory.
1161 * The question is if the old scheduler can be
1162 * restored all the time. TODO: if it cannot be
1163 * restored some time, we must delete the service,
1164 * otherwise the system may crash.
1166 ip_vs_bind_scheduler(svc, old_sched);
1173 write_unlock_bh(&__ip_vs_svc_lock);
1176 ip_vs_scheduler_put(old_sched);
/*
 * Service deletion.  __ip_vs_del_service() does the actual teardown
 * (caller holds the _bh write lock and has unlinked the service):
 * kill estimator, unbind/put scheduler, drop the app inc, unlink and
 * delete every destination, adjust the FTP/null counters, free when
 * refcnt is 0, and release the module use count.  ip_vs_del_service()
 * is the entry point: unhash, wait for other users, then tear down.
 */
1183 * Delete a service from the service list
1184 * - The service must be unlinked, unlocked and not referenced!
1185 * - We are called under _bh lock
1187 static void __ip_vs_del_service(struct ip_vs_service *svc)
1189 struct ip_vs_dest *dest, *nxt;
1190 struct ip_vs_scheduler *old_sched;
1192 ip_vs_num_services--;
1193 ip_vs_kill_estimator(&svc->stats);
1195 /* Unbind scheduler */
1196 old_sched = svc->scheduler;
1197 ip_vs_unbind_scheduler(svc);
1199 ip_vs_scheduler_put(old_sched);
1201 /* Unbind app inc */
1203 ip_vs_app_inc_put(svc->inc);
1208 * Unlink the whole destination list
1210 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1211 __ip_vs_unlink_dest(svc, dest, 0);
1212 __ip_vs_del_dest(dest);
1216 * Update the virtual service counters
1218 if (svc->port == FTPPORT)
1219 atomic_dec(&ip_vs_ftpsvc_counter);
1220 else if (svc->port == 0)
1221 atomic_dec(&ip_vs_nullsvc_counter);
1224 * Free the service if nobody refers to it
1226 if (atomic_read(&svc->refcnt) == 0)
1229 /* decrease the module use count */
1230 ip_vs_use_count_dec();
1234 * Delete a service from the service list
1236 static int ip_vs_del_service(struct ip_vs_service *svc)
1242 * Unhash it from the service table
1244 write_lock_bh(&__ip_vs_svc_lock);
1246 ip_vs_svc_unhash(svc);
1249 * Wait until all the svc users go away.
1251 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1253 __ip_vs_del_service(svc);
1255 write_unlock_bh(&__ip_vs_svc_lock);
/*
 * Delete every virtual service: walk all buckets of both service
 * tables, and for each service unhash it, wait for its users to drain
 * (usecnt > 0 here — the flusher holds no use reference, unlike the
 * single-service delete path which waits for usecnt > 1), then tear
 * it down, all under the service write lock per iteration.
 */
1262 * Flush all the virtual services
1264 static int ip_vs_flush(void)
1267 struct ip_vs_service *svc, *nxt;
1270 * Flush the service table hashed by <protocol,addr,port>
1272 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1273 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1274 write_lock_bh(&__ip_vs_svc_lock);
1275 ip_vs_svc_unhash(svc);
1277 * Wait until all the svc users go away.
1279 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1280 __ip_vs_del_service(svc);
1281 write_unlock_bh(&__ip_vs_svc_lock);
1286 * Flush the service table hashed by fwmark
1288 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1289 list_for_each_entry_safe(svc, nxt,
1290 &ip_vs_svc_fwm_table[idx], f_list) {
1291 write_lock_bh(&__ip_vs_svc_lock);
1292 ip_vs_svc_unhash(svc);
1294 * Wait until all the svc users go away.
1296 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1297 __ip_vs_del_service(svc);
1298 write_unlock_bh(&__ip_vs_svc_lock);
/*
 * Statistics reset.  ip_vs_zero_service() zeroes one service's stats
 * and those of each of its destinations, under the service write
 * lock.  ip_vs_zero_all() applies that to every service in both
 * tables and finally zeroes the global ip_vs_stats.
 */
1307 * Zero counters in a service or all services
1309 static int ip_vs_zero_service(struct ip_vs_service *svc)
1311 struct ip_vs_dest *dest;
1313 write_lock_bh(&__ip_vs_svc_lock);
1314 list_for_each_entry(dest, &svc->destinations, n_list) {
1315 ip_vs_zero_stats(&dest->stats);
1317 ip_vs_zero_stats(&svc->stats);
1318 write_unlock_bh(&__ip_vs_svc_lock);
1322 static int ip_vs_zero_all(void)
1325 struct ip_vs_service *svc;
1327 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1328 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1329 ip_vs_zero_service(svc);
1333 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1334 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1335 ip_vs_zero_service(svc);
1339 ip_vs_zero_stats(&ip_vs_stats);
/*
 * sysctl proc handlers.
 * proc_do_defense_mode: delegates to proc_dointvec(), then on write
 * validates the new value is in [0,3] (restoring the old value
 * otherwise) and re-runs update_defense_level().
 * proc_do_sync_threshold: snapshots the 2-int array, delegates to
 * proc_dointvec(), and restores the snapshot when the written pair
 * is invalid (negative, or threshold >= period).
 */
1345 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1346 void *buffer, size_t *lenp)
1348 int *valp = table->data;
1352 rc = proc_dointvec(table, write, filp, buffer, lenp);
1353 if (write && (*valp != val)) {
1354 if ((*valp < 0) || (*valp > 3)) {
1355 /* Restore the correct value */
1359 update_defense_level();
1368 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1369 void *buffer, size_t *lenp)
1371 int *valp = table->data;
1375 /* backup the value first */
1376 memcpy(val, valp, sizeof(val));
1378 rc = proc_dointvec(table, write, filp, buffer, lenp);
1379 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1380 /* Restore the correct value */
1381 memcpy(valp, val, sizeof(val));
/*
 * The /proc/sys/net/ipv4/vs/ sysctl tree.  vs_vars holds the leaf
 * entries (defense modes route through proc_do_defense_mode, timeouts
 * through proc_dointvec_jiffies, sync_threshold through its paired
 * validator); vs_dir/ipv4_dir/root_dir chain the directory nodes
 * "vs" -> "ipv4" -> "net" up to CTL_NET.
 */
1388 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1390 struct ip_vs_sysctl_table {
1391 struct ctl_table_header *sysctl_header;
1392 ctl_table vs_vars[NET_IPV4_VS_LAST];
1393 ctl_table vs_dir[2];
1394 ctl_table ipv4_dir[2];
1395 ctl_table root_dir[2];
1398 static struct ip_vs_sysctl_table ipv4_vs_table = {
1400 {{NET_IPV4_VS_AMEMTHRESH, "amemthresh",
1401 &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL,
1403 #ifdef CONFIG_IP_VS_DEBUG
1404 {NET_IPV4_VS_DEBUG_LEVEL, "debug_level",
1405 &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL,
1408 {NET_IPV4_VS_AMDROPRATE, "am_droprate",
1409 &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL,
1411 {NET_IPV4_VS_DROP_ENTRY, "drop_entry",
1412 &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL,
1413 &proc_do_defense_mode},
1414 {NET_IPV4_VS_DROP_PACKET, "drop_packet",
1415 &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL,
1416 &proc_do_defense_mode},
1417 {NET_IPV4_VS_SECURE_TCP, "secure_tcp",
1418 &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL,
1419 &proc_do_defense_mode},
/* per-state TCP/UDP/ICMP timeouts for the secure_tcp (DoS) tables */
1421 {NET_IPV4_VS_TO_ES, "timeout_established",
1422 &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1423 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1424 {NET_IPV4_VS_TO_SS, "timeout_synsent",
1425 &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1426 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1427 {NET_IPV4_VS_TO_SR, "timeout_synrecv",
1428 &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1429 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1430 {NET_IPV4_VS_TO_FW, "timeout_finwait",
1431 &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1432 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1433 {NET_IPV4_VS_TO_TW, "timeout_timewait",
1434 &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1435 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1436 {NET_IPV4_VS_TO_CL, "timeout_close",
1437 &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1438 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1439 {NET_IPV4_VS_TO_CW, "timeout_closewait",
1440 &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1441 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1442 {NET_IPV4_VS_TO_LA, "timeout_lastack",
1443 &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1444 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1445 {NET_IPV4_VS_TO_LI, "timeout_listen",
1446 &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1447 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1448 {NET_IPV4_VS_TO_SA, "timeout_synack",
1449 &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1450 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1451 {NET_IPV4_VS_TO_UDP, "timeout_udp",
1452 &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1453 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1454 {NET_IPV4_VS_TO_ICMP, "timeout_icmp",
1455 &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1456 sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
1458 {NET_IPV4_VS_CACHE_BYPASS, "cache_bypass",
1459 &sysctl_ip_vs_cache_bypass, sizeof(int), 0644, NULL,
1461 {NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn",
1462 &sysctl_ip_vs_expire_nodest_conn, sizeof(int), 0644, NULL,
1464 {NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold",
1465 &sysctl_ip_vs_sync_threshold, sizeof(sysctl_ip_vs_sync_threshold),
1466 0644, NULL, &proc_do_sync_threshold},
1467 {NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send",
1468 &sysctl_ip_vs_nat_icmp_send, sizeof(int), 0644, NULL,
/* directory chain: net/ipv4/vs */
1471 {{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table.vs_vars},
1473 {{NET_IPV4, "ipv4", NULL, 0, 0555, ipv4_vs_table.vs_dir},
1475 {{CTL_NET, "net", NULL, 0, 0555, ipv4_vs_table.ipv4_dir},
/*
 * /proc seq_file support (CONFIG_PROC_FS).
 * ip_vs_fwd_name: map a connection's forwarding-method bits to a
 * short display string (the string literals are on omitted lines).
 * ip_vs_info_array: locate the pos-th service across both tables,
 * recording which table and bucket the iterator is in.
 */
1479 #ifdef CONFIG_PROC_FS
1482 struct list_head *table;
1487 * Write the contents of the VS rule table to a PROCfs file.
1488 * (It is kept just for backward compatibility)
1490 static inline const char *ip_vs_fwd_name(unsigned flags)
1492 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1493 case IP_VS_CONN_F_LOCALNODE:
1495 case IP_VS_CONN_F_TUNNEL:
1497 case IP_VS_CONN_F_DROUTE:
1505 /* Get the Nth entry in the two lists */
1506 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1508 struct ip_vs_iter *iter = seq->private;
1510 struct ip_vs_service *svc;
1512 /* look in hash by protocol */
1513 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1514 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1516 iter->table = ip_vs_svc_table;
1523 /* keep looking in fwmark */
1524 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1525 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1527 iter->table = ip_vs_svc_fwm_table;
/*
 * seq_file .start: take the service-table read lock (held for the whole
 * traversal until ip_vs_info_seq_stop releases it) and position at
 * *pos.  Position 0 yields SEQ_START_TOKEN so .show prints the header.
 */
1537 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1540 	read_lock_bh(&__ip_vs_svc_lock);
1541 	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file .next: advance to the following service.  Continues within
 * the current hash bucket, then scans later buckets of the same table;
 * when the protocol-hashed table is exhausted it switches the iterator
 * to the fwmark-hashed table.  (The *pos increment and the assignment
 * of svc from v are elided in this excerpt.)
 */
1545 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1547 	struct list_head *e;
1548 	struct ip_vs_iter *iter;
1549 	struct ip_vs_service *svc;
1552 	if (v == SEQ_START_TOKEN)
1553 		return ip_vs_info_array(seq,0);
1556 	iter = seq->private;
1558 	if (iter->table == ip_vs_svc_table) {
1559 		/* next service in table hashed by protocol */
1560 		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1561 			return list_entry(e, struct ip_vs_service, s_list);
1564 		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1565 			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1571 		iter->table = ip_vs_svc_fwm_table;
1576 	/* next service in hashed by fwmark */
1577 	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1578 		return list_entry(e, struct ip_vs_service, f_list);
1581 	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1582 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
/* seq_file .stop: drop the read lock taken in ip_vs_info_seq_start(). */
1590 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1592 	read_unlock_bh(&__ip_vs_svc_lock);
/*
 * seq_file .show: print one row of /proc/net/ip_vs.
 * SEQ_START_TOKEN produces the version banner and column headers;
 * otherwise v is a service: one line for the service (addr:port form
 * for protocol-hashed entries, "FWM <mark>" form for fwmark entries,
 * plus persistence info when IP_VS_SVC_F_PERSISTENT is set) followed
 * by one indented line per real-server destination.
 */
1596 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1598 	if (v == SEQ_START_TOKEN) {
1600 			"IP Virtual Server version %d.%d.%d (size=%d)\n",
1601 			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1603 			"Prot LocalAddress:Port Scheduler Flags\n");
1605 			"  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1607 		const struct ip_vs_service *svc = v;
1608 		const struct ip_vs_iter *iter = seq->private;
1609 		const struct ip_vs_dest *dest;
1611 		if (iter->table == ip_vs_svc_table)
1612 			seq_printf(seq, "%s  %08X:%04X %s ",
1613 				   ip_vs_proto_name(svc->protocol),
1616 				   svc->scheduler->name);
1618 			seq_printf(seq, "FWM  %08X %s ",
1619 				   svc->fwmark, svc->scheduler->name);
1621 		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1622 			seq_printf(seq, "persistent %d %08X\n",
1624 				   ntohl(svc->netmask));
1626 			seq_putc(seq, '\n');
1628 		list_for_each_entry(dest, &svc->destinations, n_list) {
1630 				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1631 				   ntohl(dest->addr), ntohs(dest->port),
1632 				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1633 				   atomic_read(&dest->weight),
1634 				   atomic_read(&dest->activeconns),
1635 				   atomic_read(&dest->inactconns));
/* seq_file callbacks for /proc/net/ip_vs. */
1641 static struct seq_operations ip_vs_info_seq_ops = {
1642 	.start = ip_vs_info_seq_start,
1643 	.next  = ip_vs_info_seq_next,
1644 	.stop  = ip_vs_info_seq_stop,
1645 	.show  = ip_vs_info_seq_show,
/*
 * open() for /proc/net/ip_vs: allocate a per-reader iterator, open the
 * seq_file, and zero the iterator before installing it as seq->private.
 * Error paths (kmalloc/seq_open failure, freeing s) are elided in this
 * excerpt; the iterator is freed via seq_release_private on close.
 */
1648 static int ip_vs_info_open(struct inode *inode, struct file *file)
1650 	struct seq_file *seq;
1652 	struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1657 	rc = seq_open(file, &ip_vs_info_seq_ops);
1661 	seq	     = file->private_data;
1663 	memset(s, 0, sizeof(*s));
/* File operations for /proc/net/ip_vs (seq_read line elided here). */
1671 static struct file_operations ip_vs_info_fops = {
1672 	.owner	 = THIS_MODULE,
1673 	.open    = ip_vs_info_open,
1675 	.llseek  = seq_lseek,
1676 	.release = seq_release_private,
/* Global IPVS traffic counters, guarded by ip_vs_stats.lock and
 * exported through /proc/net/ip_vs_stats below. */
1681 struct ip_vs_stats ip_vs_stats;
1683 #ifdef CONFIG_PROC_FS
/*
 * Render /proc/net/ip_vs_stats: cumulative totals (conns, in/out
 * packets, in/out bytes, all in hex) followed by current per-second
 * rates, sampled under ip_vs_stats.lock so the row is self-consistent.
 */
1684 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1687 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1689 		   "   Total Incoming Outgoing         Incoming         Outgoing\n");
1691 		   "   Conns  Packets  Packets            Bytes            Bytes\n");
1693 	spin_lock_bh(&ip_vs_stats.lock);
1694 	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1695 		   ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1696 		   (unsigned long long) ip_vs_stats.inbytes,
1697 		   (unsigned long long) ip_vs_stats.outbytes);
1699 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1701 		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1702 	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1707 			ip_vs_stats.outbps);
1708 	spin_unlock_bh(&ip_vs_stats.lock);
/* open() for /proc/net/ip_vs_stats: single-record seq_file. */
1713 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1715 	return single_open(file, ip_vs_stats_show, NULL);
/* File operations for /proc/net/ip_vs_stats (seq_read line elided). */
1718 static struct file_operations ip_vs_stats_fops = {
1719 	.owner = THIS_MODULE,
1720 	.open = ip_vs_stats_seq_open,
1722 	.llseek = seq_lseek,
1723 	.release = single_release,
1729  *	Set timeout values for tcp tcpfin udp in the timeout_table.
/*
 * Apply user-supplied protocol timeouts (IP_VS_SO_SET_TIMEOUT).
 * Values arrive in seconds and are stored in jiffies (* HZ); a value
 * of 0 means "leave the current timeout unchanged".  Each protocol is
 * compiled in conditionally, hence the #ifdef sections.
 */
1731 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1733 	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1738 #ifdef CONFIG_IP_VS_PROTO_TCP
1739 	if (u->tcp_timeout) {
1740 		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1741 			= u->tcp_timeout * HZ;
1744 	if (u->tcp_fin_timeout) {
1745 		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1746 			= u->tcp_fin_timeout * HZ;
1750 #ifdef CONFIG_IP_VS_PROTO_UDP
1751 	if (u->udp_timeout) {
1752 		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1753 			= u->udp_timeout * HZ;
/* Expected argument sizes for the setsockopt commands: SET_CMDID maps
 * a command number to a zero-based index into set_arglen[] below. */
1760 #define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
1761 #define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
1762 #define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
1763 				 sizeof(struct ip_vs_dest_user))
1764 #define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
1765 #define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
1766 #define MAX_ARG_LEN		SVCDEST_ARG_LEN
/* Per-command argument length table; do_ip_vs_set_ctl() rejects any
 * request whose length does not match exactly. */
1768 static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1769 	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
1770 	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
1771 	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
1772 	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
1773 	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
1774 	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
1775 	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
1776 	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
1777 	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
1778 	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
1779 	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
/*
 * setsockopt handler for all IP_VS_SO_SET_* commands.
 * Requires CAP_NET_ADMIN; validates the argument length against
 * set_arglen[], copies it from userspace, pins the module use count
 * and serializes on __ip_vs_mutex.  FLUSH/TIMEOUT/STARTDAEMON/
 * STOPDAEMON are handled up front; the remaining commands decode a
 * service (and optionally a destination immediately following it in
 * the buffer) and dispatch to the add/edit/del/zero service and
 * destination operations.  Several error-path and unlock lines are
 * elided in this excerpt.
 */
1783 do_ip_vs_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len)
1786 	unsigned char arg[MAX_ARG_LEN];
1787 	struct ip_vs_service_user *usvc;
1788 	struct ip_vs_service *svc;
1789 	struct ip_vs_dest_user *udest;
1791 	if (!capable(CAP_NET_ADMIN))
1794 	if (len != set_arglen[SET_CMDID(cmd)]) {
1795 		IP_VS_ERR("set_ctl: len %u != %u\n",
1796 			  len, set_arglen[SET_CMDID(cmd)]);
1800 	if (copy_from_user(arg, user, len) != 0)
1803 	/* increase the module use count */
1804 	ip_vs_use_count_inc();
	/* interruptible: a signal while waiting aborts the operation */
1806 	if (down_interruptible(&__ip_vs_mutex)) {
1811 	if (cmd == IP_VS_SO_SET_FLUSH) {
1812 		/* Flush the virtual service */
1813 		ret = ip_vs_flush();
1815 	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1816 		/* Set timeout values for (tcp tcpfin udp) */
1817 		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1819 	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1820 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1821 		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1823 	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1824 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1825 		ret = stop_sync_thread(dm->state);
	/* remaining commands: service spec, optionally followed by a dest */
1829 	usvc = (struct ip_vs_service_user *)arg;
1830 	udest = (struct ip_vs_dest_user *)(usvc + 1);
1832 	if (cmd == IP_VS_SO_SET_ZERO) {
1833 		/* if no service address is set, zero counters in all */
1834 		if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1835 			ret = ip_vs_zero_all();
1840 	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1841 	if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1842 		IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s",
1843 			   ntohs(usvc->protocol), NIPQUAD(usvc->addr),
1844 			   ntohs(usvc->port), usvc->sched_name);
1849 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
1850 	if (usvc->fwmark == 0)
1851 		svc = __ip_vs_service_get(usvc->protocol,
1852 					  usvc->addr, usvc->port);
1854 		svc = __ip_vs_svc_fwm_get(usvc->fwmark);
	/* every command except ADD needs an existing, protocol-matching svc */
1856 	if (cmd != IP_VS_SO_SET_ADD
1857 	    && (svc == NULL || svc->protocol != usvc->protocol)) {
1863 	case IP_VS_SO_SET_ADD:
1867 			ret = ip_vs_add_service(usvc, &svc);
1869 	case IP_VS_SO_SET_EDIT:
1870 		ret = ip_vs_edit_service(svc, usvc);
1872 	case IP_VS_SO_SET_DEL:
1873 		ret = ip_vs_del_service(svc);
1877 	case IP_VS_SO_SET_ZERO:
1878 		ret = ip_vs_zero_service(svc);
1880 	case IP_VS_SO_SET_ADDDEST:
1881 		ret = ip_vs_add_dest(svc, udest);
1883 	case IP_VS_SO_SET_EDITDEST:
1884 		ret = ip_vs_edit_dest(svc, udest);
1886 	case IP_VS_SO_SET_DELDEST:
1887 		ret = ip_vs_del_dest(svc, udest);
	/* drop the reference taken by the service lookup above */
1894 		ip_vs_service_put(svc);
1899 	/* decrease the module use count */
1900 	ip_vs_use_count_dec();
/*
 * Snapshot the counter portion of an ip_vs_stats into a userspace-
 * layout struct under the stats lock.  The copy length is computed as
 * the byte offset of the embedded lock, i.e. everything before the
 * lock member — this relies on the counters being laid out first in
 * struct ip_vs_stats and mirrored by ip_vs_stats_user.
 */
1907 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
1909 	spin_lock_bh(&src->lock);
1910 	memcpy(dst, src, (char*)&src->lock - (char*)src);
1911 	spin_unlock_bh(&src->lock);
/*
 * Fill a userspace service entry from a kernel service: identity
 * fields, scheduler name, flags, timeout converted from jiffies to
 * seconds, netmask, destination count, and a locked stats snapshot.
 */
1915 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
1917 	dst->protocol = src->protocol;
1918 	dst->addr = src->addr;
1919 	dst->port = src->port;
1920 	dst->fwmark = src->fwmark;
1921 	strcpy(dst->sched_name, src->scheduler->name);
1922 	dst->flags = src->flags;
1923 	dst->timeout = src->timeout / HZ;
1924 	dst->netmask = src->netmask;
1925 	dst->num_dests = src->num_dests;
1926 	ip_vs_copy_stats(&dst->stats, &src->stats);
/*
 * IP_VS_SO_GET_SERVICES: copy up to get->num_services service entries
 * to userspace, walking the protocol-hashed table and then the
 * fwmark-hashed table.  Each entry is staged in a local struct and
 * copied out individually; the count/EFAULT bookkeeping lines are
 * elided in this excerpt.
 */
1930 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
1931 			    struct ip_vs_get_services __user *uptr)
1934 	struct ip_vs_service *svc;
1935 	struct ip_vs_service_entry entry;
1938 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1939 		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1940 			if (count >= get->num_services)
1942 			ip_vs_copy_service(&entry, svc);
1943 			if (copy_to_user(&uptr->entrytable[count],
1944 					 &entry, sizeof(entry))) {
1952 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1953 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1954 			if (count >= get->num_services)
1956 			ip_vs_copy_service(&entry, svc);
1957 			if (copy_to_user(&uptr->entrytable[count],
1958 					 &entry, sizeof(entry))) {
/*
 * IP_VS_SO_GET_DESTS: look up the service (by fwmark if non-zero,
 * otherwise by protocol/addr/port) and copy up to get->num_dests real-
 * server entries to userspace, including a locked stats snapshot per
 * destination.  The service reference taken by the lookup is released
 * with ip_vs_service_put(); not-found handling is elided here.
 */
1970 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
1971 			 struct ip_vs_get_dests __user *uptr)
1973 	struct ip_vs_service *svc;
1977 		svc = __ip_vs_svc_fwm_get(get->fwmark);
1979 		svc = __ip_vs_service_get(get->protocol,
1980 					  get->addr, get->port);
1983 		struct ip_vs_dest *dest;
1984 		struct ip_vs_dest_entry entry;
1986 		list_for_each_entry(dest, &svc->destinations, n_list) {
1987 			if (count >= get->num_dests)
1990 			entry.addr = dest->addr;
1991 			entry.port = dest->port;
1992 			entry.conn_flags = atomic_read(&dest->conn_flags);
1993 			entry.weight = atomic_read(&dest->weight);
1994 			entry.u_threshold = dest->u_threshold;
1995 			entry.l_threshold = dest->l_threshold;
1996 			entry.activeconns = atomic_read(&dest->activeconns);
1997 			entry.inactconns = atomic_read(&dest->inactconns);
1998 			entry.persistconns = atomic_read(&dest->persistconns);
1999 			ip_vs_copy_stats(&entry.stats, &dest->stats);
2000 			if (copy_to_user(&uptr->entrytable[count],
2001 					 &entry, sizeof(entry))) {
2007 		ip_vs_service_put(svc);
/*
 * Report the current TCP/TCP-FIN/UDP timeouts, converted from jiffies
 * back to seconds (inverse of ip_vs_set_timeout()).  Fields for
 * protocols compiled out are presumably zeroed elsewhere — the memset
 * line is elided in this excerpt.
 */
2014 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2016 #ifdef CONFIG_IP_VS_PROTO_TCP
2018 		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2019 	u->tcp_fin_timeout =
2020 		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2022 #ifdef CONFIG_IP_VS_PROTO_UDP
2024 		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
/* Minimum argument sizes for the getsockopt commands; GET_CMDID maps a
 * command number to a zero-based index into get_arglen[] below. */
2029 #define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
2030 #define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
2031 #define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
2032 #define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
2033 #define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
2034 #define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
2035 #define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
/* Per-command minimum buffer length; do_ip_vs_get_ctl() rejects any
 * request whose *len is smaller.  GET_VERSION uses a fixed 64 bytes. */
2037 static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2038 	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
2039 	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
2040 	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
2041 	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
2042 	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
2043 	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
2044 	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
/*
 * getsockopt handler for all IP_VS_SO_GET_* commands.
 * Requires CAP_NET_ADMIN; checks *len against get_arglen[], copies the
 * fixed-size request header into a 128-byte local buffer, and
 * serializes on __ip_vs_mutex.  Variable-length replies (SERVICES,
 * DESTS) recompute the exact expected size from the embedded count
 * before copying entries out.  Unlock/return bookkeeping lines are
 * elided in this excerpt.
 */
2048 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2050 	unsigned char arg[128];
2053 	if (!capable(CAP_NET_ADMIN))
2056 	if (*len < get_arglen[GET_CMDID(cmd)]) {
2057 		IP_VS_ERR("get_ctl: len %u < %u\n",
2058 			  *len, get_arglen[GET_CMDID(cmd)]);
2062 	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2065 	if (down_interruptible(&__ip_vs_mutex))
2066 		return -ERESTARTSYS;
2069 	case IP_VS_SO_GET_VERSION:
	/* version banner string; actual length returned via *len */
2073 		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2074 			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2075 		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2079 		*len = strlen(buf)+1;
2083 	case IP_VS_SO_GET_INFO:
2085 		struct ip_vs_getinfo info;
2086 		info.version = IP_VS_VERSION_CODE;
2087 		info.size = IP_VS_CONN_TAB_SIZE;
2088 		info.num_services = ip_vs_num_services;
2089 		if (copy_to_user(user, &info, sizeof(info)) != 0)
2094 	case IP_VS_SO_GET_SERVICES:
2096 		struct ip_vs_get_services *get;
2099 		get = (struct ip_vs_get_services *)arg;
2100 		size = sizeof(*get) +
2101 			sizeof(struct ip_vs_service_entry) * get->num_services;
2103 			IP_VS_ERR("length: %u != %u\n", *len, size);
2107 		ret = __ip_vs_get_service_entries(get, user);
2111 	case IP_VS_SO_GET_SERVICE:
2113 		struct ip_vs_service_entry *entry;
2114 		struct ip_vs_service *svc;
2116 		entry = (struct ip_vs_service_entry *)arg;
	/* fwmark takes precedence over protocol/addr/port lookup */
2118 			svc = __ip_vs_svc_fwm_get(entry->fwmark);
2120 			svc = __ip_vs_service_get(entry->protocol,
2121 						  entry->addr, entry->port);
2123 			ip_vs_copy_service(entry, svc);
2124 			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2126 			ip_vs_service_put(svc);
2132 	case IP_VS_SO_GET_DESTS:
2134 		struct ip_vs_get_dests *get;
2137 		get = (struct ip_vs_get_dests *)arg;
2138 		size = sizeof(*get) +
2139 			sizeof(struct ip_vs_dest_entry) * get->num_dests;
2141 			IP_VS_ERR("length: %u != %u\n", *len, size);
2145 		ret = __ip_vs_get_dest_entries(get, user);
2149 	case IP_VS_SO_GET_TIMEOUT:
2151 		struct ip_vs_timeout_user t;
2153 		__ip_vs_get_timeouts(&t);
2154 		if (copy_to_user(user, &t, sizeof(t)) != 0)
2159 	case IP_VS_SO_GET_DAEMON:
2161 		struct ip_vs_daemon_user d[2];
	/* slot 0 = master daemon state, slot 1 = backup daemon state */
2163 		memset(&d, 0, sizeof(d));
2164 		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2165 			d[0].state = IP_VS_STATE_MASTER;
2166 			strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2167 			d[0].syncid = ip_vs_master_syncid;
2169 		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2170 			d[1].state = IP_VS_STATE_BACKUP;
2171 			strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2172 			d[1].syncid = ip_vs_backup_syncid;
2174 		if (copy_to_user(user, &d, sizeof(d)) != 0)
/* Netfilter sockopt registration: both the set and get ranges start at
 * IP_VS_BASE_CTL and dispatch to the handlers above.  Positional
 * initializers (pre-designated-init style for this struct). */
2189 static struct nf_sockopt_ops ip_vs_sockopts = {
2190 	{ NULL, NULL }, PF_INET,
2191 	IP_VS_BASE_CTL, IP_VS_SO_SET_MAX+1, do_ip_vs_set_ctl,
2192 	IP_VS_BASE_CTL, IP_VS_SO_GET_MAX+1, do_ip_vs_get_ctl
/*
 * Module-init for the control plane: register the sockopt interface,
 * create the /proc entries, register the sysctl tree, initialize the
 * service/real-server hash tables and global stats (with estimator),
 * and arm the periodic defense timer.  Mirrored teardown is in
 * ip_vs_control_cleanup() below; error handling for sockopt
 * registration failure is partially elided in this excerpt.
 */
2196 int ip_vs_control_init(void)
2203 	ret = nf_register_sockopt(&ip_vs_sockopts);
2205 		IP_VS_ERR("cannot register sockopt.\n");
2209 	proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2210 	proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2212 	ipv4_vs_table.sysctl_header =
2213 		register_sysctl_table(ipv4_vs_table.root_dir, 0);
2215 	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2216 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2217 		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2218 		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2220 	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2221 		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2224 	memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2225 	ip_vs_stats.lock = SPIN_LOCK_UNLOCKED;
2226 	ip_vs_new_estimator(&ip_vs_stats);
2228 	/* Hook the defense timer */
2229 	init_timer(&defense_timer);
2230 	defense_timer.function = defense_timer_handler;
2231 	defense_timer.expires = jiffies + DEFENSE_TIMER_PERIOD;
2232 	add_timer(&defense_timer);
/*
 * Module-exit counterpart of ip_vs_control_init(): tear everything
 * down in reverse order — trash list, defense timer (synchronously, so
 * no handler is still running), stats estimator, sysctl tree, /proc
 * entries, and finally the sockopt interface.
 */
2239 void ip_vs_control_cleanup(void)
2242 	ip_vs_trash_cleanup();
2243 	del_timer_sync(&defense_timer);
2244 	ip_vs_kill_estimator(&ip_vs_stats);
2245 	unregister_sysctl_table(ipv4_vs_table.sysctl_header);
2246 	proc_net_remove("ip_vs_stats");
2247 	proc_net_remove("ip_vs");
2248 	nf_unregister_sockopt(&ip_vs_sockopts);