Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-flexiantxendom0-natty.git] / net / ipv4 / udp.c
index 0fa9f70..d5e75e9 100644 (file)
 #include <net/xfrm.h>
 #include "udp_impl.h"
 
-struct udp_table udp_table;
+struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
 
 int sysctl_udp_mem[3] __read_mostly;
@@ -121,14 +121,16 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
 atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
-#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
                               const struct udp_hslot *hslot,
                               unsigned long *bitmap,
                               struct sock *sk,
                               int (*saddr_comp)(const struct sock *sk1,
-                                                const struct sock *sk2))
+                                                const struct sock *sk2),
+                              unsigned int log)
 {
        struct sock *sk2;
        struct hlist_nulls_node *node;
@@ -142,8 +144,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
                        || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
                    (*saddr_comp)(sk, sk2)) {
                        if (bitmap)
-                               __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
-                                         bitmap);
+                               __set_bit(sk2->sk_hash >> log, bitmap);
                        else
                                return 1;
                }
@@ -180,13 +181,15 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
                /*
                 * force rand to be an odd multiple of UDP_HTABLE_SIZE
                 */
-               rand = (rand | 1) * UDP_HTABLE_SIZE;
-               for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
-                       hslot = &udptable->hash[udp_hashfn(net, first)];
+               rand = (rand | 1) * (udptable->mask + 1);
+               for (last = first + udptable->mask + 1;
+                    first != last;
+                    first++) {
+                       hslot = udp_hashslot(udptable, net, first);
                        bitmap_zero(bitmap, PORTS_PER_CHAIN);
                        spin_lock_bh(&hslot->lock);
                        udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
-                                           saddr_comp);
+                                           saddr_comp, udptable->log);
 
                        snum = first;
                        /*
@@ -196,7 +199,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
                         */
                        do {
                                if (low <= snum && snum <= high &&
-                                   !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
+                                   !test_bit(snum >> udptable->log, bitmap))
                                        goto found;
                                snum += rand;
                        } while (snum != first);
@@ -204,13 +207,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
                }
                goto fail;
        } else {
-               hslot = &udptable->hash[udp_hashfn(net, snum)];
+               hslot = udp_hashslot(udptable, net, snum);
                spin_lock_bh(&hslot->lock);
-               if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
+               if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+                                       saddr_comp, 0))
                        goto fail_unlock;
        }
 found:
-       inet_sk(sk)->num = snum;
+       inet_sk(sk)->inet_num = snum;
        sk->sk_hash = snum;
        if (sk_unhashed(sk)) {
                sk_nulls_add_node_rcu(sk, &hslot->head);
@@ -229,8 +233,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
        struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
        return  (!ipv6_only_sock(sk2)  &&
-                (!inet1->rcv_saddr || !inet2->rcv_saddr ||
-                  inet1->rcv_saddr == inet2->rcv_saddr));
+                (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
+                  inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
 }
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
@@ -249,18 +253,18 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
                struct inet_sock *inet = inet_sk(sk);
 
                score = (sk->sk_family == PF_INET ? 1 : 0);
-               if (inet->rcv_saddr) {
-                       if (inet->rcv_saddr != daddr)
+               if (inet->inet_rcv_saddr) {
+                       if (inet->inet_rcv_saddr != daddr)
                                return -1;
                        score += 2;
                }
-               if (inet->daddr) {
-                       if (inet->daddr != saddr)
+               if (inet->inet_daddr) {
+                       if (inet->inet_daddr != saddr)
                                return -1;
                        score += 2;
                }
-               if (inet->dport) {
-                       if (inet->dport != sport)
+               if (inet->inet_dport) {
+                       if (inet->inet_dport != sport)
                                return -1;
                        score += 2;
                }
@@ -283,7 +287,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
        struct sock *sk, *result;
        struct hlist_nulls_node *node;
        unsigned short hnum = ntohs(dport);
-       unsigned int hash = udp_hashfn(net, hnum);
+       unsigned int hash = udp_hashfn(net, hnum, udptable->mask);
        struct udp_hslot *hslot = &udptable->hash[hash];
        int score, badness;
 
@@ -356,9 +360,10 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 
                if (!net_eq(sock_net(s), net)                           ||
                    s->sk_hash != hnum                                  ||
-                   (inet->daddr && inet->daddr != rmt_addr)            ||
-                   (inet->dport != rmt_port && inet->dport)            ||
-                   (inet->rcv_saddr && inet->rcv_saddr != loc_addr)    ||
+                   (inet->inet_daddr && inet->inet_daddr != rmt_addr)  ||
+                   (inet->inet_dport != rmt_port && inet->inet_dport)  ||
+                   (inet->inet_rcv_saddr       &&
+                    inet->inet_rcv_saddr != loc_addr)                  ||
                    ipv6_only_sock(s)                                   ||
                    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
                        continue;
@@ -642,14 +647,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
        } else {
                if (sk->sk_state != TCP_ESTABLISHED)
                        return -EDESTADDRREQ;
-               daddr = inet->daddr;
-               dport = inet->dport;
+               daddr = inet->inet_daddr;
+               dport = inet->inet_dport;
                /* Open fast path for connected socket.
                   Route will not be used, if at least one option is set.
                 */
                connected = 1;
        }
-       ipc.addr = inet->saddr;
+       ipc.addr = inet->inet_saddr;
 
        ipc.oif = sk->sk_bound_dev_if;
        err = sock_tx_timestamp(msg, sk, &ipc.shtx);
@@ -704,7 +709,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                                    .proto = sk->sk_protocol,
                                    .flags = inet_sk_flowi_flags(sk),
                                    .uli_u = { .ports =
-                                              { .sport = inet->sport,
+                                              { .sport = inet->inet_sport,
                                                 .dport = dport } } };
                struct net *net = sock_net(sk);
 
@@ -748,7 +753,7 @@ back_from_confirm:
        inet->cork.fl.fl4_dst = daddr;
        inet->cork.fl.fl_ip_dport = dport;
        inet->cork.fl.fl4_src = saddr;
-       inet->cork.fl.fl_ip_sport = inet->sport;
+       inet->cork.fl.fl_ip_sport = inet->inet_sport;
        up->pending = AF_INET;
 
 do_append_data:
@@ -862,6 +867,7 @@ static unsigned int first_packet_length(struct sock *sk)
                udp_lib_checksum_complete(skb)) {
                UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
                                 IS_UDPLITE(sk));
+               atomic_inc(&sk->sk_drops);
                __skb_unlink(skb, rcvq);
                __skb_queue_tail(&list_kill, skb);
        }
@@ -982,7 +988,7 @@ try_again:
                UDP_INC_STATS_USER(sock_net(sk),
                                UDP_MIB_INDATAGRAMS, is_udplite);
 
-       sock_recv_timestamp(msg, sk, skb);
+       sock_recv_ts_and_drops(msg, sk, skb);
 
        /* Copy the address. */
        if (sin) {
@@ -1023,15 +1029,15 @@ int udp_disconnect(struct sock *sk, int flags)
         */
 
        sk->sk_state = TCP_CLOSE;
-       inet->daddr = 0;
-       inet->dport = 0;
+       inet->inet_daddr = 0;
+       inet->inet_dport = 0;
        sk->sk_bound_dev_if = 0;
        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
                inet_reset_saddr(sk);
 
        if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
                sk->sk_prot->unhash(sk);
-               inet->sport = 0;
+               inet->inet_sport = 0;
        }
        sk_dst_reset(sk);
        return 0;
@@ -1042,12 +1048,12 @@ void udp_lib_unhash(struct sock *sk)
 {
        if (sk_hashed(sk)) {
                struct udp_table *udptable = sk->sk_prot->h.udp_table;
-               unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
-               struct udp_hslot *hslot = &udptable->hash[hash];
+               struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk),
+                                                    sk->sk_hash);
 
                spin_lock_bh(&hslot->lock);
                if (sk_nulls_del_node_init_rcu(sk)) {
-                       inet_sk(sk)->num = 0;
+                       inet_sk(sk)->inet_num = 0;
                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
                }
                spin_unlock_bh(&hslot->lock);
@@ -1057,25 +1063,22 @@ EXPORT_SYMBOL(udp_lib_unhash);
 
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-       int is_udplite = IS_UDPLITE(sk);
-       int rc;
+       int rc = sock_queue_rcv_skb(sk, skb);
+
+       if (rc < 0) {
+               int is_udplite = IS_UDPLITE(sk);
 
-       if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
                /* Note that an ENOMEM error is charged twice */
-               if (rc == -ENOMEM) {
+               if (rc == -ENOMEM)
                        UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
                                         is_udplite);
-                       atomic_inc(&sk->sk_drops);
-               }
-               goto drop;
+               UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+               kfree_skb(skb);
+               return -1;
        }
 
        return 0;
 
-drop:
-       UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-       kfree_skb(skb);
-       return -1;
 }
 
 /* returns:
@@ -1182,6 +1185,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 drop:
        UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+       atomic_inc(&sk->sk_drops);
        kfree_skb(skb);
        return -1;
 }
@@ -1198,7 +1202,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
                                    struct udp_table *udptable)
 {
        struct sock *sk;
-       struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
+       struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
        int dif;
 
        spin_lock(&hslot->lock);
@@ -1620,9 +1624,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
        struct udp_iter_state *state = seq->private;
        struct net *net = seq_file_net(seq);
 
-       for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+       for (state->bucket = start; state->bucket <= state->udp_table->mask;
+            ++state->bucket) {
                struct hlist_nulls_node *node;
                struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+               if (hlist_nulls_empty(&hslot->head))
+                       continue;
+
                spin_lock_bh(&hslot->lock);
                sk_nulls_for_each(sk, node, &hslot->head) {
                        if (!net_eq(sock_net(sk), net))
@@ -1647,7 +1656,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
        } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
        if (!sk) {
-               if (state->bucket < UDP_HTABLE_SIZE)
+               if (state->bucket <= state->udp_table->mask)
                        spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
                return udp_get_first(seq, state->bucket + 1);
        }
@@ -1667,7 +1676,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
 {
        struct udp_iter_state *state = seq->private;
-       state->bucket = UDP_HTABLE_SIZE;
+       state->bucket = MAX_UDP_PORTS;
 
        return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
@@ -1689,7 +1698,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
 {
        struct udp_iter_state *state = seq->private;
 
-       if (state->bucket < UDP_HTABLE_SIZE)
+       if (state->bucket <= state->udp_table->mask)
                spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 }
 
@@ -1744,12 +1753,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
                int bucket, int *len)
 {
        struct inet_sock *inet = inet_sk(sp);
-       __be32 dest = inet->daddr;
-       __be32 src  = inet->rcv_saddr;
-       __u16 destp       = ntohs(inet->dport);
-       __u16 srcp        = ntohs(inet->sport);
+       __be32 dest = inet->inet_daddr;
+       __be32 src  = inet->inet_rcv_saddr;
+       __u16 destp       = ntohs(inet->inet_dport);
+       __u16 srcp        = ntohs(inet->inet_sport);
 
-       seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+       seq_printf(f, "%5d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
                bucket, src, srcp, dest, destp, sp->sk_state,
                sk_wmem_alloc_get(sp),
@@ -1815,11 +1824,43 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-void __init udp_table_init(struct udp_table *table)
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
 {
-       int i;
+       if (!str)
+               return 0;
+       uhash_entries = simple_strtoul(str, &str, 0);
+       if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+               uhash_entries = UDP_HTABLE_SIZE_MIN;
+       return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
 
-       for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+       unsigned int i;
+
+       if (!CONFIG_BASE_SMALL)
+               table->hash = alloc_large_system_hash(name,
+                       sizeof(struct udp_hslot),
+                       uhash_entries,
+                       21, /* one slot per 2 MB */
+                       0,
+                       &table->log,
+                       &table->mask,
+                       64 * 1024);
+       /*
+        * Make sure hash table has the minimum size
+        */
+       if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
+               table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
+                                     sizeof(struct udp_hslot), GFP_KERNEL);
+               if (!table->hash)
+                       panic(name);
+               table->log = ilog2(UDP_HTABLE_SIZE_MIN);
+               table->mask = UDP_HTABLE_SIZE_MIN - 1;
+       }
+       for (i = 0; i <= table->mask; i++) {
                INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
                spin_lock_init(&table->hash[i].lock);
        }
@@ -1829,7 +1870,7 @@ void __init udp_init(void)
 {
        unsigned long nr_pages, limit;
 
-       udp_table_init(&udp_table);
+       udp_table_init(&udp_table, "UDP");
        /* Set the pressure threshold up by the same strategy of TCP. It is a
         * fraction of global memory that is up to 1/2 at 256 MB, decreasing
         * toward zero with the amount of memory, with a floor of 128 pages.