UBUNTU: Ubuntu-2.6.38-12.51
[linux-flexiantxendom0-natty.git] / net / socket.c
index 7f67c07..1204afd 100644 (file)
@@ -156,7 +156,7 @@ static const struct file_operations socket_file_ops = {
  */
 
 static DEFINE_SPINLOCK(net_family_lock);
-static const struct net_proto_family *net_families[NPROTO] __read_mostly;
+static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
 
 /*
  *     Statistics counters of the socket lists
@@ -262,6 +262,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 }
 
 
+
 static void wq_free_rcu(struct rcu_head *head)
 {
        struct socket_wq *wq = container_of(head, struct socket_wq, rcu);
@@ -305,22 +306,6 @@ static const struct super_operations sockfs_ops = {
        .statfs         = simple_statfs,
 };
 
-static int sockfs_get_sb(struct file_system_type *fs_type,
-                        int flags, const char *dev_name, void *data,
-                        struct vfsmount *mnt)
-{
-       return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
-                            mnt);
-}
-
-static struct vfsmount *sock_mnt __read_mostly;
-
-static struct file_system_type sock_fs_type = {
-       .name =         "sockfs",
-       .get_sb =       sockfs_get_sb,
-       .kill_sb =      kill_anon_super,
-};
-
 /*
  * sockfs_dname() is called from d_path().
  */
@@ -334,6 +319,21 @@ static const struct dentry_operations sockfs_dentry_operations = {
        .d_dname  = sockfs_dname,
 };
 
+static struct dentry *sockfs_mount(struct file_system_type *fs_type,
+                        int flags, const char *dev_name, void *data)
+{
+       return mount_pseudo(fs_type, "socket:", &sockfs_ops,
+               &sockfs_dentry_operations, SOCKFS_MAGIC);
+}
+
+static struct vfsmount *sock_mnt __read_mostly;
+
+static struct file_system_type sock_fs_type = {
+       .name =         "sockfs",
+       .mount =        sockfs_mount,
+       .kill_sb =      kill_anon_super,
+};
+
 /*
  *     Obtains the first available file descriptor and sets it up for use.
  *
@@ -362,14 +362,13 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
        if (unlikely(fd < 0))
                return fd;
 
-       path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
+       path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
        if (unlikely(!path.dentry)) {
                put_unused_fd(fd);
                return -ENOMEM;
        }
        path.mnt = mntget(sock_mnt);
 
-       path.dentry->d_op = &sockfs_dentry_operations;
        d_instantiate(path.dentry, SOCK_INODE(sock));
        SOCK_INODE(sock)->i_fop = &socket_file_ops;
 
@@ -377,7 +376,7 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
                  &socket_file_ops);
        if (unlikely(!file)) {
                /* drop dentry, keep inode */
-               atomic_inc(&path.dentry->d_inode->i_count);
+               ihold(path.dentry->d_inode);
                path_put(&path);
                put_unused_fd(fd);
                return -ENFILE;
@@ -480,6 +479,7 @@ static struct socket *sock_alloc(void)
        sock = SOCKET_I(inode);
 
        kmemcheck_annotate_bitfield(sock, type);
+       inode->i_ino = get_next_ino();
        inode->i_mode = S_IFSOCK | S_IRWXUGO;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
@@ -733,6 +733,21 @@ static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
        return ret;
 }
 
+/**
+ * kernel_recvmsg - Receive a message from a socket (kernel space)
+ * @sock:       The socket to receive the message from
+ * @msg:        Received message
+ * @vec:        Input s/g array for message data
+ * @num:        Size of input s/g array
+ * @size:       Number of bytes to read
+ * @flags:      Message flags (MSG_DONTWAIT, etc...)
+ *
+ * On return the msg structure contains the scatter/gather array passed in the
+ * vec argument. The array is modified so that it consists of the unfilled
+ * portion of the original array.
+ *
+ * The returned value is the total number of bytes received, or an error.
+ */
 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size, int flags)
 {
@@ -1201,7 +1216,7 @@ int __sock_create(struct net *net, int family, int type, int protocol,
         * requested real, full-featured networking support upon configuration.
         * Otherwise module support will break!
         */
-       if (net_families[family] == NULL)
+       if (rcu_access_pointer(net_families[family]) == NULL)
                request_module("net-pf-%d", family);
 #endif
 
@@ -1653,6 +1668,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
        struct iovec iov;
        int fput_needed;
 
+       if (len > INT_MAX)
+               len = INT_MAX;
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
@@ -1710,6 +1727,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
        int err, err2;
        int fput_needed;
 
+       if (size > INT_MAX)
+               size = INT_MAX;
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
@@ -2098,14 +2117,16 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
                 */
                if (MSG_CMSG_COMPAT & flags) {
                        err = __sys_recvmsg(sock, (struct msghdr __user *)compat_entry,
-                                           &msg_sys, flags, datagrams);
+                                           &msg_sys, flags & ~MSG_WAITFORONE,
+                                           datagrams);
                        if (err < 0)
                                break;
                        err = __put_user(err, &compat_entry->msg_len);
                        ++compat_entry;
                } else {
                        err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
-                                           &msg_sys, flags, datagrams);
+                                           &msg_sys, flags & ~MSG_WAITFORONE,
+                                           datagrams);
                        if (err < 0)
                                break;
                        err = put_user(err, &entry->msg_len);
@@ -2329,10 +2350,11 @@ int sock_register(const struct net_proto_family *ops)
        }
 
        spin_lock(&net_family_lock);
-       if (net_families[ops->family])
+       if (rcu_dereference_protected(net_families[ops->family],
+                                     lockdep_is_held(&net_family_lock)))
                err = -EEXIST;
        else {
-               net_families[ops->family] = ops;
+               rcu_assign_pointer(net_families[ops->family], ops);
                err = 0;
        }
        spin_unlock(&net_family_lock);
@@ -2360,7 +2382,7 @@ void sock_unregister(int family)
        BUG_ON(family < 0 || family >= NPROTO);
 
        spin_lock(&net_family_lock);
-       net_families[family] = NULL;
+       rcu_assign_pointer(net_families[family], NULL);
        spin_unlock(&net_family_lock);
 
        synchronize_rcu();
@@ -2371,6 +2393,8 @@ EXPORT_SYMBOL(sock_unregister);
 
 static int __init sock_init(void)
 {
+       int err;
+
        /*
         *      Initialize sock SLAB cache.
         */
@@ -2387,8 +2411,15 @@ static int __init sock_init(void)
         */
 
        init_inodecache();
-       register_filesystem(&sock_fs_type);
+
+       err = register_filesystem(&sock_fs_type);
+       if (err)
+               goto out_fs;
        sock_mnt = kern_mount(&sock_fs_type);
+       if (IS_ERR(sock_mnt)) {
+               err = PTR_ERR(sock_mnt);
+               goto out_mount;
+       }
 
        /* The real protocol initialization is performed in later initcalls.
         */
@@ -2401,7 +2432,13 @@ static int __init sock_init(void)
        skb_timestamping_init();
 #endif
 
-       return 0;
+out:
+       return err;
+
+out_mount:
+       unregister_filesystem(&sock_fs_type);
+out_fs:
+       goto out;
 }
 
 core_initcall(sock_init);      /* early initcall */
@@ -2548,23 +2585,123 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
 
 static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 {
+       struct compat_ethtool_rxnfc __user *compat_rxnfc;
+       bool convert_in = false, convert_out = false;
+       size_t buf_size = ALIGN(sizeof(struct ifreq), 8);
+       struct ethtool_rxnfc __user *rxnfc;
        struct ifreq __user *ifr;
+       u32 rule_cnt = 0, actual_rule_cnt;
+       u32 ethcmd;
        u32 data;
-       void __user *datap;
+       int ret;
 
-       ifr = compat_alloc_user_space(sizeof(*ifr));
+       if (get_user(data, &ifr32->ifr_ifru.ifru_data))
+               return -EFAULT;
 
-       if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
+       compat_rxnfc = compat_ptr(data);
+
+       if (get_user(ethcmd, &compat_rxnfc->cmd))
                return -EFAULT;
 
-       if (get_user(data, &ifr32->ifr_ifru.ifru_data))
+       /* Most ethtool structures are defined without padding, so they
+        * pass through unchanged; struct ethtool_rxnfc is an exception.
+        */
+       switch (ethcmd) {
+       default:
+               break;
+       case ETHTOOL_GRXCLSRLALL:
+               /* Buffer size is variable */
+               if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
+                       return -EFAULT;
+               if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
+                       return -ENOMEM;
+               buf_size += rule_cnt * sizeof(u32);
+               /* fall through */
+       case ETHTOOL_GRXRINGS:
+       case ETHTOOL_GRXCLSRLCNT:
+       case ETHTOOL_GRXCLSRULE:
+               convert_out = true;
+               /* fall through */
+       case ETHTOOL_SRXCLSRLDEL:
+       case ETHTOOL_SRXCLSRLINS:
+               buf_size += sizeof(struct ethtool_rxnfc);
+               convert_in = true;
+               break;
+       }
+
+       ifr = compat_alloc_user_space(buf_size);
+       rxnfc = (void *)ifr + ALIGN(sizeof(struct ifreq), 8);
+
+       if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
                return -EFAULT;
 
-       datap = compat_ptr(data);
-       if (put_user(datap, &ifr->ifr_ifru.ifru_data))
+       if (put_user(convert_in ? rxnfc : compat_ptr(data),
+                    &ifr->ifr_ifru.ifru_data))
                return -EFAULT;
 
-       return dev_ioctl(net, SIOCETHTOOL, ifr);
+       if (convert_in) {
+               /* We expect there to be holes between fs.m_u and
+                * fs.ring_cookie and at the end of fs, but nowhere else.
+                */
+               BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_u) +
+                            sizeof(compat_rxnfc->fs.m_u) !=
+                            offsetof(struct ethtool_rxnfc, fs.m_u) +
+                            sizeof(rxnfc->fs.m_u));
+               BUILD_BUG_ON(
+                       offsetof(struct compat_ethtool_rxnfc, fs.location) -
+                       offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
+                       offsetof(struct ethtool_rxnfc, fs.location) -
+                       offsetof(struct ethtool_rxnfc, fs.ring_cookie));
+
+               if (copy_in_user(rxnfc, compat_rxnfc,
+                                (void *)(&rxnfc->fs.m_u + 1) -
+                                (void *)rxnfc) ||
+                   copy_in_user(&rxnfc->fs.ring_cookie,
+                                &compat_rxnfc->fs.ring_cookie,
+                                (void *)(&rxnfc->fs.location + 1) -
+                                (void *)&rxnfc->fs.ring_cookie) ||
+                   copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt,
+                                sizeof(rxnfc->rule_cnt)))
+                       return -EFAULT;
+       }
+
+       ret = dev_ioctl(net, SIOCETHTOOL, ifr);
+       if (ret)
+               return ret;
+
+       if (convert_out) {
+               if (copy_in_user(compat_rxnfc, rxnfc,
+                                (const void *)(&rxnfc->fs.m_u + 1) -
+                                (const void *)rxnfc) ||
+                   copy_in_user(&compat_rxnfc->fs.ring_cookie,
+                                &rxnfc->fs.ring_cookie,
+                                (const void *)(&rxnfc->fs.location + 1) -
+                                (const void *)&rxnfc->fs.ring_cookie) ||
+                   copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
+                                sizeof(rxnfc->rule_cnt)))
+                       return -EFAULT;
+
+               if (ethcmd == ETHTOOL_GRXCLSRLALL) {
+                       /* As an optimisation, we only copy the actual
+                        * number of rules that the underlying
+                        * function returned.  Since Mallory might
+                        * change the rule count in user memory, we
+                        * clamp it to the rule count originally
+                        * given (as the user buffer size), which has
+                        * already been range-checked.
+                        */
+                       if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
+                               return -EFAULT;
+                       if (actual_rule_cnt < rule_cnt)
+                               rule_cnt = actual_rule_cnt;
+                       if (copy_in_user(&compat_rxnfc->rule_locs[0],
+                                        &rxnfc->rule_locs[0],
+                                        rule_cnt * sizeof(u32)))
+                               return -EFAULT;
+               }
+       }
+
+       return 0;
 }
 
 static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)