openvswitch: Honor dp_ifindex, when specified, for vport lookup by name.
net/openvswitch/datapath.c
/*
 * Copyright (c) 2007-2012 Nicira Networks.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/system.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <linux/workqueue.h>
#include <net/genetlink.h>

#include "datapath.h"
#include "flow.h"
#include "vport-internal_dev.h"

/**
 * DOC: Locking:
 *
 * Writes to device state (add/remove datapath, port, set operations on vports,
 * etc.) are protected by RTNL.
 *
 * Writes to other state (flow table modifications, set miscellaneous datapath
 * parameters, etc.) are protected by genl_mutex.  The RTNL lock nests inside
 * genl_mutex.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of the above and don't interact
 * with each other.
 */
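
/* A minimal sketch of the read-side pattern implied above (illustrative
 * only, not code from this file): datapath state is dereferenced under
 * rcu_read_lock() and never used past the critical section:
 *
 *	rcu_read_lock();
 *	flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
 *	...
 *	rcu_read_unlock();
 */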

/* Global list of datapaths to enable dumping them all out.
 * Protected by genl_mutex.
 */
static LIST_HEAD(dps);

#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
static void rehash_flow_table(struct work_struct *work);
static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
                             const struct dp_upcall_info *);
static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
                                  const struct dp_upcall_info *);

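/* get_dp() maps a dp_ifindex -- the ifindex of a datapath's local
 * internal port -- back to its struct datapath; NULL means no datapath
 * owns that ifindex. */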
/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
static struct datapath *get_dp(int dp_ifindex)
{
        struct datapath *dp = NULL;
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(&init_net, dp_ifindex);
        if (dev) {
                struct vport *vport = ovs_internal_dev_get_vport(dev);
                if (vport)
                        dp = vport->dp;
        }
        rcu_read_unlock();

        return dp;
}

/* Must be called with rcu_read_lock or RTNL lock. */
const char *ovs_dp_name(const struct datapath *dp)
{
        struct vport *vport = rcu_dereference_rtnl(dp->ports[OVSP_LOCAL]);
        return vport->ops->get_name(vport);
}

static int get_dpifindex(struct datapath *dp)
{
        struct vport *local;
        int ifindex;

        rcu_read_lock();

        local = rcu_dereference(dp->ports[OVSP_LOCAL]);
        if (local)
                ifindex = local->ops->get_ifindex(local);
        else
                ifindex = 0;

        rcu_read_unlock();

        return ifindex;
}

static void destroy_dp_rcu(struct rcu_head *rcu)
{
        struct datapath *dp = container_of(rcu, struct datapath, rcu);

        ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
        free_percpu(dp->stats_percpu);
        kfree(dp);
}

/* Called with RTNL lock and genl_lock. */
static struct vport *new_vport(const struct vport_parms *parms)
{
        struct vport *vport;

        vport = ovs_vport_add(parms);
        if (!IS_ERR(vport)) {
                struct datapath *dp = parms->dp;

                rcu_assign_pointer(dp->ports[parms->port_no], vport);
                list_add(&vport->node, &dp->port_list);
        }

        return vport;
}

/* Called with RTNL lock. */
void ovs_dp_detach_port(struct vport *p)
{
        ASSERT_RTNL();

        /* First drop references to device. */
        list_del(&p->node);
        rcu_assign_pointer(p->dp->ports[p->port_no], NULL);

        /* Then destroy it. */
        ovs_vport_del(p);
}

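/* Per-packet receive path: extract a flow key from 'skb', look it up in
 * the datapath's flow table, and either run the cached actions on a hit
 * or queue the packet to userspace as an OVS_PACKET_CMD_MISS upcall. */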
/* Must be called with rcu_read_lock. */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{
        struct datapath *dp = p->dp;
        struct sw_flow *flow;
        struct dp_stats_percpu *stats;
        struct sw_flow_key key;
        u64 *stats_counter;
        int error;
        int key_len;

        stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

        /* Extract flow from 'skb' into 'key'. */
        error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
        if (unlikely(error)) {
                kfree_skb(skb);
                return;
        }

        /* Look up flow. */
        flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
        if (unlikely(!flow)) {
                struct dp_upcall_info upcall;

                upcall.cmd = OVS_PACKET_CMD_MISS;
                upcall.key = &key;
                upcall.userdata = NULL;
                upcall.pid = p->upcall_pid;
                ovs_dp_upcall(dp, skb, &upcall);
                consume_skb(skb);
                stats_counter = &stats->n_missed;
                goto out;
        }

        OVS_CB(skb)->flow = flow;

        stats_counter = &stats->n_hit;
        ovs_flow_used(OVS_CB(skb)->flow, skb);
        ovs_execute_actions(dp, skb);

out:
        /* Update datapath statistics. */
        u64_stats_update_begin(&stats->sync);
        (*stats_counter)++;
        u64_stats_update_end(&stats->sync);
}

static struct genl_family dp_packet_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_PACKET_FAMILY,
        .version = OVS_PACKET_VERSION,
        .maxattr = OVS_PACKET_ATTR_MAX
};

int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
              const struct dp_upcall_info *upcall_info)
{
        struct dp_stats_percpu *stats;
        int dp_ifindex;
        int err;

        if (upcall_info->pid == 0) {
                err = -ENOTCONN;
                goto err;
        }

        dp_ifindex = get_dpifindex(dp);
        if (!dp_ifindex) {
                err = -ENODEV;
                goto err;
        }

        if (!skb_is_gso(skb))
                err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
        else
                err = queue_gso_packets(dp_ifindex, skb, upcall_info);
        if (err)
                goto err;

        return 0;

err:
        stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

        u64_stats_update_begin(&stats->sync);
        stats->n_lost++;
        u64_stats_update_end(&stats->sync);

        return err;
}

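/* GSO upcalls: segment the super-packet with skb_gso_segment() and queue
 * each resulting segment individually.  For UDP fragmentation, segments
 * after the first are re-keyed with OVS_FRAG_TYPE_LATER, since the key
 * extracted from the original skb described only the first fragment. */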
static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
                             const struct dp_upcall_info *upcall_info)
{
        struct dp_upcall_info later_info;
        struct sw_flow_key later_key;
        struct sk_buff *segs, *nskb;
        int err;

        segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
        if (IS_ERR(segs))
                return PTR_ERR(segs);

        /* Queue all of the segments. */
        skb = segs;
        do {
                err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
                if (err)
                        break;

                if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
                        /* The initial flow key extracted by ovs_flow_extract()
                         * in this case is for a first fragment, so we need to
                         * properly mark later fragments.
                         */
                        later_key = *upcall_info->key;
                        later_key.ip.frag = OVS_FRAG_TYPE_LATER;

                        later_info = *upcall_info;
                        later_info.key = &later_key;
                        upcall_info = &later_info;
                }
        } while ((skb = skb->next));

        /* Free all of the segments. */
        skb = segs;
        do {
                nskb = skb->next;
                if (err)
                        kfree_skb(skb);
                else
                        consume_skb(skb);
        } while ((skb = nskb));
        return err;
}

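/* The upcall message built below carries, per packet:
 *
 *	struct ovs_header        -- dp_ifindex
 *	OVS_PACKET_ATTR_KEY      -- nested flow key attributes
 *	OVS_PACKET_ATTR_USERDATA -- optional u64 (OVS_PACKET_CMD_ACTION only)
 *	OVS_PACKET_ATTR_PACKET   -- the packet contents
 */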
static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
                                  const struct dp_upcall_info *upcall_info)
{
        struct ovs_header *upcall;
        struct sk_buff *nskb = NULL;
        struct sk_buff *user_skb; /* to be queued to userspace */
        struct nlattr *nla;
        unsigned int len;
        int err;

        if (vlan_tx_tag_present(skb)) {
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        return -ENOMEM;

                nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb));
                if (!nskb)
                        return -ENOMEM;

                nskb->vlan_tci = 0;
                skb = nskb;
        }

        if (nla_attr_size(skb->len) > USHRT_MAX) {
                err = -EFBIG;
                goto out;
        }

        len = sizeof(struct ovs_header);
        len += nla_total_size(skb->len);
        len += nla_total_size(FLOW_BUFSIZE);
        if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
                len += nla_total_size(8);

        user_skb = genlmsg_new(len, GFP_ATOMIC);
        if (!user_skb) {
                err = -ENOMEM;
                goto out;
        }

        upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
                             0, upcall_info->cmd);
        upcall->dp_ifindex = dp_ifindex;

        nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
        ovs_flow_to_nlattrs(upcall_info->key, user_skb);
        nla_nest_end(user_skb, nla);

        if (upcall_info->userdata)
                nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
                            nla_get_u64(upcall_info->userdata));

        nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);

        skb_copy_and_csum_dev(skb, nla_data(nla));

        err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);

out:
        kfree_skb(nskb);
        return err;
}

/* Called with genl_mutex. */
static int flush_flows(int dp_ifindex)
{
        struct flow_table *old_table;
        struct flow_table *new_table;
        struct datapath *dp;

        dp = get_dp(dp_ifindex);
        if (!dp)
                return -ENODEV;

        old_table = genl_dereference(dp->table);
        new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
        if (!new_table)
                return -ENOMEM;

        rcu_assign_pointer(dp->table, new_table);

        ovs_flow_tbl_deferred_destroy(old_table);
        return 0;
}

static int validate_actions(const struct nlattr *attr,
                                const struct sw_flow_key *key, int depth);

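/* validate_sample() checks a nested OVS_ACTION_ATTR_SAMPLE attribute:
 * exactly one u32 probability plus a nested action list, which is in
 * turn validated recursively (recursion bounded by SAMPLE_ACTION_DEPTH
 * in validate_actions()). */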
static int validate_sample(const struct nlattr *attr,
                                const struct sw_flow_key *key, int depth)
{
        const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
        const struct nlattr *probability, *actions;
        const struct nlattr *a;
        int rem;

        memset(attrs, 0, sizeof(attrs));
        nla_for_each_nested(a, attr, rem) {
                int type = nla_type(a);
                if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
                        return -EINVAL;
                attrs[type] = a;
        }
        if (rem)
                return -EINVAL;

        probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
        if (!probability || nla_len(probability) != sizeof(u32))
                return -EINVAL;

        actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
        if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
                return -EINVAL;
        return validate_actions(actions, key, depth + 1);
}

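/* validate_set() only lets an action rewrite header fields that the flow
 * key proves are present: e.g. OVS_KEY_ATTR_IPV4 is accepted only when
 * the flow's Ethertype is ETH_P_IP, and the rewrite may not change the
 * IP protocol or fragment type. */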
static int validate_set(const struct nlattr *a,
                        const struct sw_flow_key *flow_key)
{
        const struct nlattr *ovs_key = nla_data(a);
        int key_type = nla_type(ovs_key);

        /* There can be only one key in an action */
        if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
                return -EINVAL;

        if (key_type > OVS_KEY_ATTR_MAX ||
            nla_len(ovs_key) != ovs_key_lens[key_type])
                return -EINVAL;

        switch (key_type) {
        const struct ovs_key_ipv4 *ipv4_key;

        case OVS_KEY_ATTR_PRIORITY:
        case OVS_KEY_ATTR_ETHERNET:
                break;

        case OVS_KEY_ATTR_IPV4:
                if (flow_key->eth.type != htons(ETH_P_IP))
                        return -EINVAL;

                if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst)
                        return -EINVAL;

                ipv4_key = nla_data(ovs_key);
                if (ipv4_key->ipv4_proto != flow_key->ip.proto)
                        return -EINVAL;

                if (ipv4_key->ipv4_frag != flow_key->ip.frag)
                        return -EINVAL;

                break;

        case OVS_KEY_ATTR_TCP:
                if (flow_key->ip.proto != IPPROTO_TCP)
                        return -EINVAL;

                if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
                        return -EINVAL;

                break;

        case OVS_KEY_ATTR_UDP:
                if (flow_key->ip.proto != IPPROTO_UDP)
                        return -EINVAL;

                if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
                        return -EINVAL;
                break;

        default:
                return -EINVAL;
        }

        return 0;
}

static int validate_userspace(const struct nlattr *attr)
{
        static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
                [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
                [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 },
        };
        struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
        int error;

        error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
                                 attr, userspace_policy);
        if (error)
                return error;

        if (!a[OVS_USERSPACE_ATTR_PID] ||
            !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
                return -EINVAL;

        return 0;
}

static int validate_actions(const struct nlattr *attr,
                                const struct sw_flow_key *key, int depth)
{
        const struct nlattr *a;
        int rem, err;

        if (depth >= SAMPLE_ACTION_DEPTH)
                return -EOVERFLOW;

        nla_for_each_nested(a, attr, rem) {
                /* Expected argument lengths, (u32)-1 for variable length. */
                static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
                        [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
                        [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
                        [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
                        [OVS_ACTION_ATTR_POP_VLAN] = 0,
                        [OVS_ACTION_ATTR_SET] = (u32)-1,
                        [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
                };
                const struct ovs_action_push_vlan *vlan;
                int type = nla_type(a);

                if (type > OVS_ACTION_ATTR_MAX ||
                    (action_lens[type] != nla_len(a) &&
                     action_lens[type] != (u32)-1))
                        return -EINVAL;

                switch (type) {
                case OVS_ACTION_ATTR_UNSPEC:
                        return -EINVAL;

                case OVS_ACTION_ATTR_USERSPACE:
                        err = validate_userspace(a);
                        if (err)
                                return err;
                        break;

                case OVS_ACTION_ATTR_OUTPUT:
                        if (nla_get_u32(a) >= DP_MAX_PORTS)
                                return -EINVAL;
                        break;

                case OVS_ACTION_ATTR_POP_VLAN:
                        break;

                case OVS_ACTION_ATTR_PUSH_VLAN:
                        vlan = nla_data(a);
                        if (vlan->vlan_tpid != htons(ETH_P_8021Q))
                                return -EINVAL;
                        if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
                                return -EINVAL;
                        break;

                case OVS_ACTION_ATTR_SET:
                        err = validate_set(a, key);
                        if (err)
                                return err;
                        break;

                case OVS_ACTION_ATTR_SAMPLE:
                        err = validate_sample(a, key, depth);
                        if (err)
                                return err;
                        break;

                default:
                        return -EINVAL;
                }
        }

        if (rem > 0)
                return -EINVAL;

        return 0;
}

static void clear_stats(struct sw_flow *flow)
{
        flow->used = 0;
        flow->tcp_flags = 0;
        flow->packet_count = 0;
        flow->byte_count = 0;
}

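/* OVS_PACKET_CMD_EXECUTE: userspace hands us a raw packet plus metadata
 * key and an action list; rebuild an skb around the packet, validate the
 * actions against the extracted flow key, and execute them on the
 * requested datapath. */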
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
        struct ovs_header *ovs_header = info->userhdr;
        struct nlattr **a = info->attrs;
        struct sw_flow_actions *acts;
        struct sk_buff *packet;
        struct sw_flow *flow;
        struct datapath *dp;
        struct ethhdr *eth;
        int len;
        int err;
        int key_len;

        err = -EINVAL;
        if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
            !a[OVS_PACKET_ATTR_ACTIONS] ||
            nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN)
                goto err;

        len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
        packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
        err = -ENOMEM;
        if (!packet)
                goto err;
        skb_reserve(packet, NET_IP_ALIGN);

        memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len);

        skb_reset_mac_header(packet);
        eth = eth_hdr(packet);

        /* Normally, setting the skb 'protocol' field would be handled by a
         * call to eth_type_trans(), but it assumes there's a sending
         * device, which we may not have. */
        if (ntohs(eth->h_proto) >= 1536)
                packet->protocol = eth->h_proto;
        else
                packet->protocol = htons(ETH_P_802_2);

        /* Build an sw_flow for sending this packet. */
        flow = ovs_flow_alloc();
        err = PTR_ERR(flow);
        if (IS_ERR(flow))
                goto err_kfree_skb;

        err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
        if (err)
                goto err_flow_free;

        err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
                                             &flow->key.phy.in_port,
                                             a[OVS_PACKET_ATTR_KEY]);
        if (err)
                goto err_flow_free;

        err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
        if (err)
                goto err_flow_free;

        flow->hash = ovs_flow_hash(&flow->key, key_len);

        acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
        err = PTR_ERR(acts);
        if (IS_ERR(acts))
                goto err_flow_free;
        rcu_assign_pointer(flow->sf_acts, acts);

        OVS_CB(packet)->flow = flow;
        packet->priority = flow->key.phy.priority;

        rcu_read_lock();
        dp = get_dp(ovs_header->dp_ifindex);
        err = -ENODEV;
        if (!dp)
                goto err_unlock;

        local_bh_disable();
        err = ovs_execute_actions(dp, packet);
        local_bh_enable();
        rcu_read_unlock();

        ovs_flow_free(flow);
        return err;

err_unlock:
        rcu_read_unlock();
err_flow_free:
        ovs_flow_free(flow);
err_kfree_skb:
        kfree_skb(packet);
err:
        return err;
}

static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
        [OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC },
        [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
};

static struct genl_ops dp_packet_genl_ops[] = {
        { .cmd = OVS_PACKET_CMD_EXECUTE,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = packet_policy,
          .doit = ovs_packet_cmd_execute
        }
};

static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
{
        int i;
        struct flow_table *table = genl_dereference(dp->table);

        stats->n_flows = ovs_flow_tbl_count(table);

        stats->n_hit = stats->n_missed = stats->n_lost = 0;
        for_each_possible_cpu(i) {
                const struct dp_stats_percpu *percpu_stats;
                struct dp_stats_percpu local_stats;
                unsigned int start;

                percpu_stats = per_cpu_ptr(dp->stats_percpu, i);

                do {
                        start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
                        local_stats = *percpu_stats;
                } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));

                stats->n_hit += local_stats.n_hit;
                stats->n_missed += local_stats.n_missed;
                stats->n_lost += local_stats.n_lost;
        }
}

static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
        [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
        [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
        [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
};

static struct genl_family dp_flow_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_FLOW_FAMILY,
        .version = OVS_FLOW_VERSION,
        .maxattr = OVS_FLOW_ATTR_MAX
};

static struct genl_multicast_group ovs_dp_flow_multicast_group = {
        .name = OVS_FLOW_MCGROUP
};

/* Called with genl_lock. */
static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
                                  struct sk_buff *skb, u32 pid,
                                  u32 seq, u32 flags, u8 cmd)
{
        const int skb_orig_len = skb->len;
        const struct sw_flow_actions *sf_acts;
        struct ovs_flow_stats stats;
        struct ovs_header *ovs_header;
        struct nlattr *nla;
        unsigned long used;
        u8 tcp_flags;
        int err;

        sf_acts = rcu_dereference_protected(flow->sf_acts,
                                            lockdep_genl_is_held());

        ovs_header = genlmsg_put(skb, pid, seq, &dp_flow_genl_family, flags, cmd);
        if (!ovs_header)
                return -EMSGSIZE;

        ovs_header->dp_ifindex = get_dpifindex(dp);

        nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
        if (!nla)
                goto nla_put_failure;
        err = ovs_flow_to_nlattrs(&flow->key, skb);
        if (err)
                goto error;
        nla_nest_end(skb, nla);

        spin_lock_bh(&flow->lock);
        used = flow->used;
        stats.n_packets = flow->packet_count;
        stats.n_bytes = flow->byte_count;
        tcp_flags = flow->tcp_flags;
        spin_unlock_bh(&flow->lock);

        if (used)
                NLA_PUT_U64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used));

        if (stats.n_packets)
                NLA_PUT(skb, OVS_FLOW_ATTR_STATS,
                        sizeof(struct ovs_flow_stats), &stats);

        if (tcp_flags)
                NLA_PUT_U8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags);

        /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
         * this is the first flow to be dumped into 'skb'.  This is unusual for
         * Netlink but individual action lists can be longer than
         * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
         * The userspace caller can always fetch the actions separately if it
         * really wants them.  (Most userspace callers in fact don't care.)
         *
         * This can only fail for dump operations because the skb is always
         * properly sized for single flows.
         */
        err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
                      sf_acts->actions);
        if (err < 0 && skb_orig_len)
                goto error;

        return genlmsg_end(skb, ovs_header);

nla_put_failure:
        err = -EMSGSIZE;
error:
        genlmsg_cancel(skb, ovs_header);
        return err;
}

static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
{
        const struct sw_flow_actions *sf_acts;
        int len;

        sf_acts = rcu_dereference_protected(flow->sf_acts,
                                            lockdep_genl_is_held());

        /* OVS_FLOW_ATTR_KEY */
        len = nla_total_size(FLOW_BUFSIZE);
        /* OVS_FLOW_ATTR_ACTIONS */
        len += nla_total_size(sf_acts->actions_len);
        /* OVS_FLOW_ATTR_STATS */
        len += nla_total_size(sizeof(struct ovs_flow_stats));
        /* OVS_FLOW_ATTR_TCP_FLAGS */
        len += nla_total_size(1);
        /* OVS_FLOW_ATTR_USED */
        len += nla_total_size(8);

        len += NLMSG_ALIGN(sizeof(struct ovs_header));

        return genlmsg_new(len, GFP_KERNEL);
}

static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
                                               struct datapath *dp,
                                               u32 pid, u32 seq, u8 cmd)
{
        struct sk_buff *skb;
        int retval;

        skb = ovs_flow_cmd_alloc_info(flow);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        retval = ovs_flow_cmd_fill_info(flow, dp, skb, pid, seq, 0, cmd);
        BUG_ON(retval < 0);
        return skb;
}

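/* Combined handler for OVS_FLOW_CMD_NEW and OVS_FLOW_CMD_SET: look the
 * key up in the flow table, create the flow if it is absent (rejected
 * for CMD_SET), or update its actions in place if it already exists
 * (rejected for CMD_NEW with NLM_F_CREATE or NLM_F_EXCL). */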
static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow_key key;
        struct sw_flow *flow;
        struct sk_buff *reply;
        struct datapath *dp;
        struct flow_table *table;
        int error;
        int key_len;

        /* Extract key. */
        error = -EINVAL;
        if (!a[OVS_FLOW_ATTR_KEY])
                goto error;
        error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
        if (error)
                goto error;

        /* Validate actions. */
        if (a[OVS_FLOW_ATTR_ACTIONS]) {
                error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0);
                if (error)
                        goto error;
        } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
                error = -EINVAL;
                goto error;
        }

        dp = get_dp(ovs_header->dp_ifindex);
        error = -ENODEV;
        if (!dp)
                goto error;

        table = genl_dereference(dp->table);
        flow = ovs_flow_tbl_lookup(table, &key, key_len);
        if (!flow) {
                struct sw_flow_actions *acts;

                /* Bail out if we're not allowed to create a new flow. */
                error = -ENOENT;
                if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
                        goto error;

                /* Expand table, if necessary, to make room. */
                if (ovs_flow_tbl_need_to_expand(table)) {
                        struct flow_table *new_table;

                        new_table = ovs_flow_tbl_expand(table);
                        if (!IS_ERR(new_table)) {
                                rcu_assign_pointer(dp->table, new_table);
                                ovs_flow_tbl_deferred_destroy(table);
                                table = genl_dereference(dp->table);
                        }
                }

                /* Allocate flow. */
                flow = ovs_flow_alloc();
                if (IS_ERR(flow)) {
                        error = PTR_ERR(flow);
                        goto error;
                }
                flow->key = key;
                clear_stats(flow);

                /* Obtain actions. */
                acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
                error = PTR_ERR(acts);
                if (IS_ERR(acts))
                        goto error_free_flow;
                rcu_assign_pointer(flow->sf_acts, acts);

                /* Put flow in bucket. */
                flow->hash = ovs_flow_hash(&key, key_len);
                ovs_flow_tbl_insert(table, flow);

                reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
                                                info->snd_seq,
                                                OVS_FLOW_CMD_NEW);
        } else {
                /* We found a matching flow. */
                struct sw_flow_actions *old_acts;
                struct nlattr *acts_attrs;

                /* Bail out if we're not allowed to modify an existing flow.
                 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
                 * because Generic Netlink treats the latter as a dump
                 * request.  We also accept NLM_F_EXCL in case that bug ever
                 * gets fixed.
                 */
                error = -EEXIST;
                if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
                    info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
                        goto error;

                /* Update actions. */
                old_acts = rcu_dereference_protected(flow->sf_acts,
                                                     lockdep_genl_is_held());
                acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
                if (acts_attrs &&
                   (old_acts->actions_len != nla_len(acts_attrs) ||
                   memcmp(old_acts->actions, nla_data(acts_attrs),
                          old_acts->actions_len))) {
                        struct sw_flow_actions *new_acts;

                        new_acts = ovs_flow_actions_alloc(acts_attrs);
                        error = PTR_ERR(new_acts);
                        if (IS_ERR(new_acts))
                                goto error;

                        rcu_assign_pointer(flow->sf_acts, new_acts);
                        ovs_flow_deferred_free_acts(old_acts);
                }

                reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
                                               info->snd_seq, OVS_FLOW_CMD_NEW);

                /* Clear stats. */
                if (a[OVS_FLOW_ATTR_CLEAR]) {
                        spin_lock_bh(&flow->lock);
                        clear_stats(flow);
                        spin_unlock_bh(&flow->lock);
                }
        }

        if (!IS_ERR(reply))
                genl_notify(reply, genl_info_net(info), info->snd_pid,
                           ovs_dp_flow_multicast_group.id, info->nlhdr,
                           GFP_KERNEL);
        else
                netlink_set_err(init_net.genl_sock, 0,
                                ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
        return 0;

error_free_flow:
        ovs_flow_free(flow);
error:
        return error;
}

static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow;
        struct datapath *dp;
        struct flow_table *table;
        int err;
        int key_len;

        if (!a[OVS_FLOW_ATTR_KEY])
                return -EINVAL;
        err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
        if (err)
                return err;

        dp = get_dp(ovs_header->dp_ifindex);
        if (!dp)
                return -ENODEV;

        table = genl_dereference(dp->table);
        flow = ovs_flow_tbl_lookup(table, &key, key_len);
        if (!flow)
                return -ENOENT;

        reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
                                        info->snd_seq, OVS_FLOW_CMD_NEW);
        if (IS_ERR(reply))
                return PTR_ERR(reply);

        return genlmsg_reply(reply, info);
}

static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow;
        struct datapath *dp;
        struct flow_table *table;
        int err;
        int key_len;

        if (!a[OVS_FLOW_ATTR_KEY])
                return flush_flows(ovs_header->dp_ifindex);
        err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
        if (err)
                return err;

        dp = get_dp(ovs_header->dp_ifindex);
        if (!dp)
                return -ENODEV;

        table = genl_dereference(dp->table);
        flow = ovs_flow_tbl_lookup(table, &key, key_len);
        if (!flow)
                return -ENOENT;

        reply = ovs_flow_cmd_alloc_info(flow);
        if (!reply)
                return -ENOMEM;

        ovs_flow_tbl_remove(table, flow);

        err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
                                     info->snd_seq, 0, OVS_FLOW_CMD_DEL);
        BUG_ON(err < 0);

        ovs_flow_deferred_free(flow);

        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL);
        return 0;
}

static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
        struct datapath *dp;
        struct flow_table *table;

        dp = get_dp(ovs_header->dp_ifindex);
        if (!dp)
                return -ENODEV;

        table = genl_dereference(dp->table);

        for (;;) {
                struct sw_flow *flow;
                u32 bucket, obj;

                bucket = cb->args[0];
                obj = cb->args[1];
                flow = ovs_flow_tbl_next(table, &bucket, &obj);
                if (!flow)
                        break;

                if (ovs_flow_cmd_fill_info(flow, dp, skb,
                                           NETLINK_CB(cb->skb).pid,
                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                           OVS_FLOW_CMD_NEW) < 0)
                        break;

                cb->args[0] = bucket;
                cb->args[1] = obj;
        }
        return skb->len;
}

static struct genl_ops dp_flow_genl_ops[] = {
        { .cmd = OVS_FLOW_CMD_NEW,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_new_or_set
        },
        { .cmd = OVS_FLOW_CMD_DEL,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_del
        },
        { .cmd = OVS_FLOW_CMD_GET,
          .flags = 0,               /* OK for unprivileged users. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_get,
          .dumpit = ovs_flow_cmd_dump
        },
        { .cmd = OVS_FLOW_CMD_SET,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = flow_policy,
          .doit = ovs_flow_cmd_new_or_set,
        },
};

static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
        [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
        [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
};

static struct genl_family dp_datapath_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_DATAPATH_FAMILY,
        .version = OVS_DATAPATH_VERSION,
        .maxattr = OVS_DP_ATTR_MAX
};

static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
        .name = OVS_DATAPATH_MCGROUP
};

static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
                                u32 pid, u32 seq, u32 flags, u8 cmd)
{
        struct ovs_header *ovs_header;
        struct ovs_dp_stats dp_stats;
        int err;

        ovs_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family,
                                 flags, cmd);
        if (!ovs_header)
                goto error;

        ovs_header->dp_ifindex = get_dpifindex(dp);

        rcu_read_lock();
        err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
        rcu_read_unlock();
        if (err)
                goto nla_put_failure;

        get_dp_stats(dp, &dp_stats);
        NLA_PUT(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats);

        return genlmsg_end(skb, ovs_header);

nla_put_failure:
        genlmsg_cancel(skb, ovs_header);
error:
        return -EMSGSIZE;
}

static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 pid,
                                             u32 seq, u8 cmd)
{
        struct sk_buff *skb;
        int retval;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        retval = ovs_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd);
        if (retval < 0) {
                kfree_skb(skb);
                return ERR_PTR(retval);
        }
        return skb;
}

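/* A datapath can be identified two ways: by OVS_DP_ATTR_NAME, matched
 * against the local port's device name, or, failing that, by the
 * dp_ifindex in the ovs_header. */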
/* Called with genl_mutex and optionally with RTNL lock also. */
static struct datapath *lookup_datapath(struct ovs_header *ovs_header,
                                        struct nlattr *a[OVS_DP_ATTR_MAX + 1])
{
        struct datapath *dp;

        if (!a[OVS_DP_ATTR_NAME])
                dp = get_dp(ovs_header->dp_ifindex);
        else {
                struct vport *vport;

                rcu_read_lock();
                vport = ovs_vport_locate(nla_data(a[OVS_DP_ATTR_NAME]));
                dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
                rcu_read_unlock();
        }
        return dp ? dp : ERR_PTR(-ENODEV);
}

static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct vport_parms parms;
        struct sk_buff *reply;
        struct datapath *dp;
        struct vport *vport;
        int err;

        err = -EINVAL;
        if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
                goto err;

        rtnl_lock();
        err = -ENODEV;
        if (!try_module_get(THIS_MODULE))
                goto err_unlock_rtnl;

        err = -ENOMEM;
        dp = kzalloc(sizeof(*dp), GFP_KERNEL);
        if (dp == NULL)
                goto err_put_module;
        INIT_LIST_HEAD(&dp->port_list);

        /* Allocate table. */
        err = -ENOMEM;
        rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
        if (!dp->table)
                goto err_free_dp;

        dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
        if (!dp->stats_percpu) {
                err = -ENOMEM;
                goto err_destroy_table;
        }

        /* Set up our datapath device. */
        parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
        parms.type = OVS_VPORT_TYPE_INTERNAL;
        parms.options = NULL;
        parms.dp = dp;
        parms.port_no = OVSP_LOCAL;
        parms.upcall_pid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);

        vport = new_vport(&parms);
        if (IS_ERR(vport)) {
                err = PTR_ERR(vport);
                if (err == -EBUSY)
                        err = -EEXIST;

                goto err_destroy_percpu;
        }

        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
                                      info->snd_seq, OVS_DP_CMD_NEW);
        err = PTR_ERR(reply);
        if (IS_ERR(reply))
                goto err_destroy_local_port;

        list_add_tail(&dp->list_node, &dps);
        rtnl_unlock();

        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    ovs_dp_datapath_multicast_group.id, info->nlhdr,
                    GFP_KERNEL);
        return 0;

err_destroy_local_port:
        ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
err_destroy_percpu:
        free_percpu(dp->stats_percpu);
err_destroy_table:
        ovs_flow_tbl_destroy(genl_dereference(dp->table));
err_free_dp:
        kfree(dp);
err_put_module:
        module_put(THIS_MODULE);
err_unlock_rtnl:
        rtnl_unlock();
err:
        return err;
}

static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct vport *vport, *next_vport;
        struct sk_buff *reply;
        struct datapath *dp;
        int err;

        rtnl_lock();
        dp = lookup_datapath(info->userhdr, info->attrs);
        err = PTR_ERR(dp);
        if (IS_ERR(dp))
                goto exit_unlock;

        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
                                      info->snd_seq, OVS_DP_CMD_DEL);
        err = PTR_ERR(reply);
        if (IS_ERR(reply))
                goto exit_unlock;

        list_for_each_entry_safe(vport, next_vport, &dp->port_list, node)
                if (vport->port_no != OVSP_LOCAL)
                        ovs_dp_detach_port(vport);

        list_del(&dp->list_node);
        ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));

        /* rtnl_unlock() will wait until all the references to devices that
         * are pending unregistration have been dropped.  We do it here to
         * ensure that any internal devices (which contain DP pointers) are
         * fully destroyed before freeing the datapath.
         */
        rtnl_unlock();

        call_rcu(&dp->rcu, destroy_dp_rcu);
        module_put(THIS_MODULE);

        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    ovs_dp_datapath_multicast_group.id, info->nlhdr,
                    GFP_KERNEL);

        return 0;

exit_unlock:
        rtnl_unlock();
        return err;
}

static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
        struct sk_buff *reply;
        struct datapath *dp;
        int err;

        dp = lookup_datapath(info->userhdr, info->attrs);
        if (IS_ERR(dp))
                return PTR_ERR(dp);

        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
                                      info->snd_seq, OVS_DP_CMD_NEW);
        if (IS_ERR(reply)) {
                err = PTR_ERR(reply);
                netlink_set_err(init_net.genl_sock, 0,
                                ovs_dp_datapath_multicast_group.id, err);
                return 0;
        }

        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    ovs_dp_datapath_multicast_group.id, info->nlhdr,
                    GFP_KERNEL);

        return 0;
}

static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct sk_buff *reply;
        struct datapath *dp;

        dp = lookup_datapath(info->userhdr, info->attrs);
        if (IS_ERR(dp))
                return PTR_ERR(dp);

        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
                                      info->snd_seq, OVS_DP_CMD_NEW);
        if (IS_ERR(reply))
                return PTR_ERR(reply);

        return genlmsg_reply(reply, info);
}

static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct datapath *dp;
        int skip = cb->args[0];
        int i = 0;

        list_for_each_entry(dp, &dps, list_node) {
                if (i >= skip &&
                    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                         OVS_DP_CMD_NEW) < 0)
                        break;
                i++;
        }

        cb->args[0] = i;

        return skb->len;
}

static struct genl_ops dp_datapath_genl_ops[] = {
        { .cmd = OVS_DP_CMD_NEW,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_new
        },
        { .cmd = OVS_DP_CMD_DEL,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_del
        },
        { .cmd = OVS_DP_CMD_GET,
          .flags = 0,               /* OK for unprivileged users. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_get,
          .dumpit = ovs_dp_cmd_dump
        },
        { .cmd = OVS_DP_CMD_SET,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = datapath_policy,
          .doit = ovs_dp_cmd_set,
        },
};

static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
        [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
        [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
        [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
        [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
        [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
        [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
};

static struct genl_family dp_vport_genl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = sizeof(struct ovs_header),
        .name = OVS_VPORT_FAMILY,
        .version = OVS_VPORT_VERSION,
        .maxattr = OVS_VPORT_ATTR_MAX
};

struct genl_multicast_group ovs_dp_vport_multicast_group = {
        .name = OVS_VPORT_MCGROUP
};

/* Called with RTNL lock or RCU read lock. */
static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
                                   u32 pid, u32 seq, u32 flags, u8 cmd)
{
        struct ovs_header *ovs_header;
        struct ovs_vport_stats vport_stats;
        int err;

        ovs_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
                                 flags, cmd);
        if (!ovs_header)
                return -EMSGSIZE;

        ovs_header->dp_ifindex = get_dpifindex(vport->dp);

        NLA_PUT_U32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
        NLA_PUT_U32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type);
        NLA_PUT_STRING(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport));
        NLA_PUT_U32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_pid);

        ovs_vport_get_stats(vport, &vport_stats);
        NLA_PUT(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
                &vport_stats);

        err = ovs_vport_get_options(vport, skb);
        if (err == -EMSGSIZE)
                goto error;

        return genlmsg_end(skb, ovs_header);

nla_put_failure:
        err = -EMSGSIZE;
error:
        genlmsg_cancel(skb, ovs_header);
        return err;
}

/* Called with RTNL lock or RCU read lock. */
struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid,
                                         u32 seq, u8 cmd)
{
        struct sk_buff *skb;
        int retval;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        retval = ovs_vport_cmd_fill_info(vport, skb, pid, seq, 0, cmd);
        if (retval < 0) {
                kfree_skb(skb);
                return ERR_PTR(retval);
        }
        return skb;
}

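/* Per the commit subject, lookup by OVS_VPORT_ATTR_NAME now honors
 * dp_ifindex when one is supplied: a name match whose datapath has a
 * different ifindex is rejected with -ENODEV instead of silently
 * returning a port from another datapath.  A dp_ifindex of zero keeps
 * the old behavior of matching by name alone. */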
1513 /* Called with RTNL lock or RCU read lock. */
1514 static struct vport *lookup_vport(struct ovs_header *ovs_header,
1515                                   struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
1516 {
1517         struct datapath *dp;
1518         struct vport *vport;
1519
1520         if (a[OVS_VPORT_ATTR_NAME]) {
1521                 vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME]));
1522                 if (!vport)
1523                         return ERR_PTR(-ENODEV);
1524                 if (ovs_header->dp_ifindex &&
1525                     ovs_header->dp_ifindex != get_dpifindex(vport->dp))
1526                         return ERR_PTR(-ENODEV);
1527                 return vport;
1528         } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
1529                 u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1530
1531                 if (port_no >= DP_MAX_PORTS)
1532                         return ERR_PTR(-EFBIG);
1533
1534                 dp = get_dp(ovs_header->dp_ifindex);
1535                 if (!dp)
1536                         return ERR_PTR(-ENODEV);
1537
1538                 vport = rcu_dereference_rtnl(dp->ports[port_no]);
1539                 if (!vport)
1540                         return ERR_PTR(-ENOENT);
1541                 return vport;
1542         } else
1543                 return ERR_PTR(-EINVAL);
1544 }
1545
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct vport_parms parms;
        struct sk_buff *reply;
        struct vport *vport;
        struct datapath *dp;
        u32 port_no;
        int err;

        err = -EINVAL;
        if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
            !a[OVS_VPORT_ATTR_UPCALL_PID])
                goto exit;

        rtnl_lock();
        dp = get_dp(ovs_header->dp_ifindex);
        err = -ENODEV;
        if (!dp)
                goto exit_unlock;

        if (a[OVS_VPORT_ATTR_PORT_NO]) {
                port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);

                err = -EFBIG;
                if (port_no >= DP_MAX_PORTS)
                        goto exit_unlock;

                vport = rtnl_dereference(dp->ports[port_no]);
                err = -EBUSY;
                if (vport)
                        goto exit_unlock;
        } else {
                for (port_no = 1; ; port_no++) {
                        if (port_no >= DP_MAX_PORTS) {
                                err = -EFBIG;
                                goto exit_unlock;
                        }
                        vport = rtnl_dereference(dp->ports[port_no]);
                        if (!vport)
                                break;
                }
        }

        parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
        parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
        parms.options = a[OVS_VPORT_ATTR_OPTIONS];
        parms.dp = dp;
        parms.port_no = port_no;
        parms.upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);

        vport = new_vport(&parms);
        err = PTR_ERR(vport);
        if (IS_ERR(vport))
                goto exit_unlock;

        reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
                                         OVS_VPORT_CMD_NEW);
        if (IS_ERR(reply)) {
                err = PTR_ERR(reply);
                ovs_dp_detach_port(vport);
                goto exit_unlock;
        }
        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);

        /* Success: clear the stale PTR_ERR() value left in err above. */
        err = 0;

exit_unlock:
        rtnl_unlock();
exit:
        return err;
}
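
/*
 * For illustration only, not part of this file: a minimal libnl-3 sketch
 * of the request that ovs_vport_cmd_new() services.  The datapath name
 * "ovs-dp0" and port name "vport0" are placeholders, and error handling
 * is omitted; treat this as a sketch of the protocol, not a definitive
 * client implementation.
 *
 *      struct nl_sock *sk = nl_socket_alloc();
 *      struct ovs_header *hdr;
 *      struct nl_msg *msg;
 *      int family;
 *
 *      genl_connect(sk);
 *      family = genl_ctrl_resolve(sk, OVS_VPORT_FAMILY);
 *
 *      msg = nlmsg_alloc();
 *      hdr = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family,
 *                        sizeof(*hdr), 0, OVS_VPORT_CMD_NEW,
 *                        OVS_VPORT_VERSION);
 *      hdr->dp_ifindex = if_nametoindex("ovs-dp0");
 *      nla_put_string(msg, OVS_VPORT_ATTR_NAME, "vport0");
 *      nla_put_u32(msg, OVS_VPORT_ATTR_TYPE, OVS_VPORT_TYPE_NETDEV);
 *      nla_put_u32(msg, OVS_VPORT_ATTR_UPCALL_PID,
 *                  nl_socket_get_local_port(sk));
 *      nl_send_auto(sk, msg);
 *
 * Leaving out OVS_VPORT_ATTR_PORT_NO, as above, lets the kernel pick the
 * port number.
 */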

static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct sk_buff *reply;
        struct vport *vport;
        int err;

        rtnl_lock();
        vport = lookup_vport(info->userhdr, a);
        err = PTR_ERR(vport);
        if (IS_ERR(vport))
                goto exit_unlock;

        err = 0;
        if (a[OVS_VPORT_ATTR_TYPE] &&
            nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type)
                err = -EINVAL;

        if (!err && a[OVS_VPORT_ATTR_OPTIONS])
                err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
        if (!err && a[OVS_VPORT_ATTR_UPCALL_PID])
                vport->upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);

        reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
                                         OVS_VPORT_CMD_NEW);
        if (IS_ERR(reply)) {
                /* The change has already been applied, so report the lost
                 * notification to multicast listeners and return success,
                 * releasing RTNL on the way out. */
                netlink_set_err(init_net.genl_sock, 0,
                                ovs_dp_vport_multicast_group.id,
                                PTR_ERR(reply));
                err = 0;
                goto exit_unlock;
        }

        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);

exit_unlock:
        rtnl_unlock();
        return err;
}

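/* OVS_VPORT_CMD_DEL handler.  OVSP_LOCAL backs the datapath's own network
 * device and cannot be removed this way, so requests for it get -EINVAL. */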
static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct sk_buff *reply;
        struct vport *vport;
        int err;

        rtnl_lock();
        vport = lookup_vport(info->userhdr, a);
        err = PTR_ERR(vport);
        if (IS_ERR(vport))
                goto exit_unlock;

        if (vport->port_no == OVSP_LOCAL) {
                err = -EINVAL;
                goto exit_unlock;
        }

        reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
                                         OVS_VPORT_CMD_DEL);
        err = PTR_ERR(reply);
        if (IS_ERR(reply))
                goto exit_unlock;

        ovs_dp_detach_port(vport);

        genl_notify(reply, genl_info_net(info), info->snd_pid,
                    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);

        err = 0;

exit_unlock:
        rtnl_unlock();
        return err;
}

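/* OVS_VPORT_CMD_GET handler: unicasts a single vport description back to
 * the requester.  Read-only, so the RCU read lock is sufficient. */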
static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = info->userhdr;
        struct sk_buff *reply;
        struct vport *vport;
        int err;

        rcu_read_lock();
        vport = lookup_vport(ovs_header, a);
        err = PTR_ERR(vport);
        if (IS_ERR(vport))
                goto exit_unlock;

        reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
                                         OVS_VPORT_CMD_NEW);
        err = PTR_ERR(reply);
        if (IS_ERR(reply))
                goto exit_unlock;

        rcu_read_unlock();

        return genlmsg_reply(reply, info);

exit_unlock:
        rcu_read_unlock();
        return err;
}

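/* Dump callback for the vport family: walks the datapath's port table,
 * keeping the next port number to visit in cb->args[0] so that a dump
 * spanning several messages resumes where it left off.
 *
 * For illustration only (assuming libnl-3, as in the sketch above): a
 * dump is simply OVS_VPORT_CMD_GET with NLM_F_DUMP set:
 *
 *      hdr = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family,
 *                        sizeof(*hdr), NLM_F_DUMP, OVS_VPORT_CMD_GET,
 *                        OVS_VPORT_VERSION);
 *      hdr->dp_ifindex = dp_ifindex;
 *      nl_send_auto(sk, msg);
 */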
static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
        struct datapath *dp;
        u32 port_no;
        int retval;

        dp = get_dp(ovs_header->dp_ifindex);
        if (!dp)
                return -ENODEV;

        rcu_read_lock();
        for (port_no = cb->args[0]; port_no < DP_MAX_PORTS; port_no++) {
                struct vport *vport;

                vport = rcu_dereference(dp->ports[port_no]);
                if (!vport)
                        continue;

                if (ovs_vport_cmd_fill_info(vport, skb, NETLINK_CB(cb->skb).pid,
                                            cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                            OVS_VPORT_CMD_NEW) < 0)
                        break;
        }
        rcu_read_unlock();

        cb->args[0] = port_no;
        retval = skb->len;

        return retval;
}

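/* Periodic work, run every REHASH_FLOW_INTERVAL: swaps each datapath's
 * flow table for a rehashed copy and destroys the old table only after
 * an RCU grace period, so concurrent readers are never left dangling. */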
static void rehash_flow_table(struct work_struct *work)
{
        struct datapath *dp;

        genl_lock();

        list_for_each_entry(dp, &dps, list_node) {
                struct flow_table *old_table = genl_dereference(dp->table);
                struct flow_table *new_table;

                new_table = ovs_flow_tbl_rehash(old_table);
                if (!IS_ERR(new_table)) {
                        rcu_assign_pointer(dp->table, new_table);
                        ovs_flow_tbl_deferred_destroy(old_table);
                }
        }

        genl_unlock();

        schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
}

static struct genl_ops dp_vport_genl_ops[] = {
        { .cmd = OVS_VPORT_CMD_NEW,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_new
        },
        { .cmd = OVS_VPORT_CMD_DEL,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_del
        },
        { .cmd = OVS_VPORT_CMD_GET,
          .flags = 0,               /* OK for unprivileged users. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_get,
          .dumpit = ovs_vport_cmd_dump
        },
        { .cmd = OVS_VPORT_CMD_SET,
          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
          .policy = vport_policy,
          .doit = ovs_vport_cmd_set
        },
};

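/* Bundles a generic netlink family with its ops and optional multicast
 * group so registration and teardown can be driven from one table. */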
struct genl_family_and_ops {
        struct genl_family *family;
        struct genl_ops *ops;
        int n_ops;
        struct genl_multicast_group *group;
};

static const struct genl_family_and_ops dp_genl_families[] = {
        { &dp_datapath_genl_family,
          dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
          &ovs_dp_datapath_multicast_group },
        { &dp_vport_genl_family,
          dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops),
          &ovs_dp_vport_multicast_group },
        { &dp_flow_genl_family,
          dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops),
          &ovs_dp_flow_multicast_group },
        { &dp_packet_genl_family,
          dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
          NULL },
};

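/* Unregisters the first n_families entries of dp_genl_families. */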
static void dp_unregister_genl(int n_families)
{
        int i;

        for (i = 0; i < n_families; i++)
                genl_unregister_family(dp_genl_families[i].family);
}

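/* Registers every family in dp_genl_families together with its multicast
 * group, if any.  On failure, whatever was already registered is unwound
 * via dp_unregister_genl(). */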
static int dp_register_genl(void)
{
        int n_registered;
        int err;
        int i;

        n_registered = 0;
        for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
                const struct genl_family_and_ops *f = &dp_genl_families[i];

                err = genl_register_family_with_ops(f->family, f->ops,
                                                    f->n_ops);
                if (err)
                        goto error;
                n_registered++;

                if (f->group) {
                        err = genl_register_mc_group(f->family, f->group);
                        if (err)
                                goto error;
                }
        }

        return 0;

error:
        dp_unregister_genl(n_registered);
        return err;
}

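/* Module init: brings up the flow and vport layers, hooks the netdevice
 * notifier, publishes the generic netlink families, and starts the
 * periodic flow-table rehash. */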
static int __init dp_init(void)
{
        struct sk_buff *dummy_skb;
        int err;

        BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));

        pr_info("Open vSwitch switching datapath\n");

        err = ovs_flow_init();
        if (err)
                goto error;

        err = ovs_vport_init();
        if (err)
                goto error_flow_exit;

        err = register_netdevice_notifier(&ovs_dp_device_notifier);
        if (err)
                goto error_vport_exit;

        err = dp_register_genl();
        if (err < 0)
                goto error_unreg_notifier;

        schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);

        return 0;

error_unreg_notifier:
        unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_vport_exit:
        ovs_vport_exit();
error_flow_exit:
        ovs_flow_exit();
error:
        return err;
}

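/* Module exit: undoes dp_init() in reverse.  rcu_barrier() waits for
 * outstanding RCU callbacks (such as deferred flow-table frees) to finish
 * before the subsystems they rely on are torn down. */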
static void dp_cleanup(void)
{
        cancel_delayed_work_sync(&rehash_flow_wq);
        rcu_barrier();
        dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
        unregister_netdevice_notifier(&ovs_dp_device_notifier);
        ovs_vport_exit();
        ovs_flow_exit();
}

module_init(dp_init);
module_exit(dp_cleanup);

MODULE_DESCRIPTION("Open vSwitch switching datapath");
MODULE_LICENSE("GPL");