net: Compute protocol sequence numbers and fragment IDs using MD5, CVE-2011-3188
[linux-flexiantxendom0-natty.git] / net / ipv4 / route.c
index 66610ea..ca25fe7 100644 (file)
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
+#include <net/secure_seq.h>
 
 #define RT_FL_TOS(oldflp) \
     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
@@ -139,6 +140,8 @@ static unsigned long expires_ljiffies;
  */
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int     ipv4_default_advmss(const struct dst_entry *dst);
+static unsigned int     ipv4_default_mtu(const struct dst_entry *dst);
 static void             ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void             ipv4_link_failure(struct sk_buff *skb);
@@ -155,6 +158,8 @@ static struct dst_ops ipv4_dst_ops = {
        .protocol =             cpu_to_be16(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
+       .default_advmss =       ipv4_default_advmss,
+       .default_mtu =          ipv4_default_mtu,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
@@ -167,7 +172,7 @@ static struct dst_ops ipv4_dst_ops = {
 
 const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
-       ECN_OR_COST(FILLER),
+       ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
@@ -383,8 +388,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
                        (__force u32)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->dst.__refcnt),
                        r->dst.__use, 0, (__force u32)r->rt_src,
-                       (dst_metric(&r->dst, RTAX_ADVMSS) ?
-                            (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
+                       dst_metric_advmss(&r->dst) + 40,
                        dst_metric(&r->dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->dst, RTAX_RTTVAR)),
@@ -684,17 +688,17 @@ static inline bool rt_caching(const struct net *net)
 static inline bool compare_hash_inputs(const struct flowi *fl1,
                                        const struct flowi *fl2)
 {
-       return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
-               ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
+       return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
+               ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
                (fl1->iif ^ fl2->iif)) == 0);
 }
 
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
-       return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
-               ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
+       return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
+               ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
                (fl1->mark ^ fl2->mark) |
-               (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
+               (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
                (fl1->oif ^ fl2->oif) |
                (fl1->iif ^ fl2->iif)) == 0;
 }
@@ -714,13 +718,15 @@ static inline int rt_is_expired(struct rtable *rth)
  * Can be called by a softirq or a process.
  * In the later case, we want to be reschedule if necessary
  */
-static void rt_do_flush(int process_context)
+static void rt_do_flush(struct net *net, int process_context)
 {
        unsigned int i;
        struct rtable *rth, *next;
-       struct rtable * tail;
 
        for (i = 0; i <= rt_hash_mask; i++) {
+               struct rtable __rcu **pprev;
+               struct rtable *list;
+
                if (process_context && need_resched())
                        cond_resched();
                rth = rcu_dereference_raw(rt_hash_table[i].chain);
@@ -728,50 +734,32 @@ static void rt_do_flush(int process_context)
                        continue;
 
                spin_lock_bh(rt_hash_lock_addr(i));
-#ifdef CONFIG_NET_NS
-               {
-               struct rtable __rcu **prev;
-               struct rtable *p;
 
-               rth = rcu_dereference_protected(rt_hash_table[i].chain,
+               list = NULL;
+               pprev = &rt_hash_table[i].chain;
+               rth = rcu_dereference_protected(*pprev,
                        lockdep_is_held(rt_hash_lock_addr(i)));
 
-               /* defer releasing the head of the list after spin_unlock */
-               for (tail = rth; tail;
-                    tail = rcu_dereference_protected(tail->dst.rt_next,
-                               lockdep_is_held(rt_hash_lock_addr(i))))
-                       if (!rt_is_expired(tail))
-                               break;
-               if (rth != tail)
-                       rt_hash_table[i].chain = tail;
-
-               /* call rt_free on entries after the tail requiring flush */
-               prev = &rt_hash_table[i].chain;
-               for (p = rcu_dereference_protected(*prev,
-                               lockdep_is_held(rt_hash_lock_addr(i)));
-                    p != NULL;
-                    p = next) {
-                       next = rcu_dereference_protected(p->dst.rt_next,
+               while (rth) {
+                       next = rcu_dereference_protected(rth->dst.rt_next,
                                lockdep_is_held(rt_hash_lock_addr(i)));
-                       if (!rt_is_expired(p)) {
-                               prev = &p->dst.rt_next;
+
+                       if (!net ||
+                           net_eq(dev_net(rth->dst.dev), net)) {
+                               rcu_assign_pointer(*pprev, next);
+                               rcu_assign_pointer(rth->dst.rt_next, list);
+                               list = rth;
                        } else {
-                               *prev = next;
-                               rt_free(p);
+                               pprev = &rth->dst.rt_next;
                        }
+                       rth = next;
                }
-               }
-#else
-               rth = rcu_dereference_protected(rt_hash_table[i].chain,
-                       lockdep_is_held(rt_hash_lock_addr(i)));
-               rcu_assign_pointer(rt_hash_table[i].chain, NULL);
-               tail = NULL;
-#endif
+
                spin_unlock_bh(rt_hash_lock_addr(i));
 
-               for (; rth != tail; rth = next) {
-                       next = rcu_dereference_protected(rth->dst.rt_next, 1);
-                       rt_free(rth);
+               for (; list; list = next) {
+                       next = rcu_dereference_protected(list->dst.rt_next, 1);
+                       rt_free(list);
                }
        }
 }
@@ -919,13 +907,13 @@ void rt_cache_flush(struct net *net, int delay)
 {
        rt_cache_invalidate(net);
        if (delay >= 0)
-               rt_do_flush(!in_softirq());
+               rt_do_flush(net, !in_softirq());
 }
 
 /* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(void)
+void rt_cache_flush_batch(struct net *net)
 {
-       rt_do_flush(!in_softirq());
+       rt_do_flush(net, !in_softirq());
 }
 
 static void rt_emergency_hash_rebuild(struct net *net)
@@ -1289,7 +1277,7 @@ void rt_bind_peer(struct rtable *rt, int create)
 {
        struct inet_peer *peer;
 
-       peer = inet_getpeer(rt->rt_dst, create);
+       peer = inet_getpeer_v4(rt->rt_dst, create);
 
        if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
                inet_putpeer(peer);
@@ -1686,11 +1674,14 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
                                        if (mtu < dst_mtu(&rth->dst)) {
                                                dst_confirm(&rth->dst);
                                                if (mtu < ip_rt_min_pmtu) {
+                                                       u32 lock = dst_metric(&rth->dst,
+                                                                             RTAX_LOCK);
                                                        mtu = ip_rt_min_pmtu;
-                                                       rth->dst.metrics[RTAX_LOCK-1] |=
-                                                               (1 << RTAX_MTU);
+                                                       lock |= (1 << RTAX_MTU);
+                                                       dst_metric_set(&rth->dst, RTAX_LOCK,
+                                                                      lock);
                                                }
-                                               rth->dst.metrics[RTAX_MTU-1] = mtu;
+                                               dst_metric_set(&rth->dst, RTAX_MTU, mtu);
                                                dst_set_expires(&rth->dst,
                                                        ip_rt_mtu_expires);
                                        }
@@ -1708,10 +1699,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
        if (dst_mtu(dst) > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
+                       u32 lock = dst_metric(dst, RTAX_LOCK);
                        mtu = ip_rt_min_pmtu;
-                       dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
+                       dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
                }
-               dst->metrics[RTAX_MTU-1] = mtu;
+               dst_metric_set(dst, RTAX_MTU, mtu);
                dst_set_expires(dst, ip_rt_mtu_expires);
                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
        }
@@ -1794,38 +1786,55 @@ static void set_class_tag(struct rtable *rt, u32 tag)
 }
 #endif
 
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
+{
+       unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
+
+       if (advmss == 0) {
+               advmss = max_t(unsigned int, dst->dev->mtu - 40,
+                              ip_rt_min_advmss);
+               if (advmss > 65535 - 40)
+                       advmss = 65535 - 40;
+       }
+       return advmss;
+}
+
+static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
+{
+       unsigned int mtu = dst->dev->mtu;
+
+       if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+               const struct rtable *rt = (const struct rtable *) dst;
+
+               if (rt->rt_gateway != rt->rt_dst && mtu > 576)
+                       mtu = 576;
+       }
+
+       if (mtu > IP_MAX_MTU)
+               mtu = IP_MAX_MTU;
+
+       return mtu;
+}
+
 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 {
+       struct dst_entry *dst = &rt->dst;
        struct fib_info *fi = res->fi;
 
        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
-               memcpy(rt->dst.metrics, fi->fib_metrics,
-                      sizeof(rt->dst.metrics));
-               if (fi->fib_mtu == 0) {
-                       rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
-                       if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
-                           rt->rt_gateway != rt->rt_dst &&
-                           rt->dst.dev->mtu > 576)
-                               rt->dst.metrics[RTAX_MTU-1] = 576;
-               }
+               dst_import_metrics(dst, fi->fib_metrics);
 #ifdef CONFIG_NET_CLS_ROUTE
-               rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+               dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
-       } else
-               rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
-
-       if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
-               rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-       if (dst_mtu(&rt->dst) > IP_MAX_MTU)
-               rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
-       if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
-               rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
-                                      ip_rt_min_advmss);
-       if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
-               rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+       }
+
+       if (dst_mtu(dst) > IP_MAX_MTU)
+               dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
+       if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
+               dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
 
 #ifdef CONFIG_NET_CLS_ROUTE
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -2089,12 +2098,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
        struct fib_result res;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
-       struct flowi fl = { .nl_u = { .ip4_u =
-                                     { .daddr = daddr,
-                                       .saddr = saddr,
-                                       .tos = tos,
-                                       .scope = RT_SCOPE_UNIVERSE,
-                                     } },
+       struct flowi fl = { .fl4_dst    = daddr,
+                           .fl4_src    = saddr,
+                           .fl4_tos    = tos,
+                           .fl4_scope  = RT_SCOPE_UNIVERSE,
                            .mark = skb->mark,
                            .iif = dev->ifindex };
        unsigned        flags = 0;
@@ -2480,14 +2487,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                                const struct flowi *oldflp)
 {
        u32 tos = RT_FL_TOS(oldflp);
-       struct flowi fl = { .nl_u = { .ip4_u =
-                                     { .daddr = oldflp->fl4_dst,
-                                       .saddr = oldflp->fl4_src,
-                                       .tos = tos & IPTOS_RT_MASK,
-                                       .scope = ((tos & RTO_ONLINK) ?
-                                                 RT_SCOPE_LINK :
-                                                 RT_SCOPE_UNIVERSE),
-                                     } },
+       struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
+                           .fl4_src = oldflp->fl4_src,
+                           .fl4_tos = tos & IPTOS_RT_MASK,
+                           .fl4_scope = ((tos & RTO_ONLINK) ?
+                                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
                            .mark = oldflp->mark,
                            .iif = net->loopback_dev->ifindex,
                            .oif = oldflp->oif };
@@ -2559,9 +2563,10 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                        goto out;
 
                /* RACE: Check return value of inet_select_addr instead. */
-               if (rcu_dereference(dev_out->ip_ptr) == NULL)
-                       goto out;       /* Wrong error code */
-
+               if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+                       err = -ENETUNREACH;
+                       goto out;
+               }
                if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
                    ipv4_is_lbcast(oldflp->fl4_dst)) {
                        if (!fl.fl4_src)
@@ -2622,8 +2627,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
        }
 
        if (res.type == RTN_LOCAL) {
-               if (!fl.fl4_src)
-                       fl.fl4_src = fl.fl4_dst;
+               if (!fl.fl4_src) {
+                       if (res.fi->fib_prefsrc)
+                               fl.fl4_src = res.fi->fib_prefsrc;
+                       else
+                               fl.fl4_src = fl.fl4_dst;
+               }
                dev_out = net->loopback_dev;
                fl.oif = dev_out->ifindex;
                res.fi = NULL;
@@ -2699,6 +2708,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
        return NULL;
 }
 
+static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
+{
+       return 0;
+}
+
 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 }
@@ -2708,6 +2722,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
        .protocol               =       cpu_to_be16(ETH_P_IP),
        .destroy                =       ipv4_dst_destroy,
        .check                  =       ipv4_blackhole_dst_check,
+       .default_mtu            =       ipv4_blackhole_default_mtu,
+       .default_advmss         =       ipv4_default_advmss,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
 };
 
@@ -2725,7 +2741,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard;
-               memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
+               dst_copy_metrics(new, &ort->dst);
 
                new->dev = ort->dst.dev;
                if (new->dev)
@@ -2832,7 +2848,7 @@ static int rt_fill_info(struct net *net,
        if (rt->rt_dst != rt->rt_gateway)
                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
 
-       if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
+       if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
                goto nla_put_failure;
 
        if (rt->fl.mark)
@@ -2944,13 +2960,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
                        err = -rt->dst.error;
        } else {
                struct flowi fl = {
-                       .nl_u = {
-                               .ip4_u = {
-                                       .daddr = dst,
-                                       .saddr = src,
-                                       .tos = rtm->rtm_tos,
-                               },
-                       },
+                       .fl4_dst = dst,
+                       .fl4_src = src,
+                       .fl4_tos = rtm->rtm_tos,
                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
                        .mark = mark,
                };