1 /******************************************************************************
2  * drivers/xen/blktap/blktap.c
3  * 
4  * Back-end driver for user level virtual block devices. This portion of the
5  * driver exports a 'unified' block-device interface that can be accessed
6  * by any operating system that implements a compatible front end. Requests
7  * are remapped to a user-space memory region.
8  *
9  * Based on the blkback driver code.
10  * 
11  * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
12  *
13  * Clean ups and fix ups:
14  *    Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
15  *
16  * This program is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU General Public License version 2
18  * as published by the Free Software Foundation; or, when distributed
19  * separately from the Linux kernel or incorporated into other
20  * software packages, subject to the following license:
21  * 
22  * Permission is hereby granted, free of charge, to any person obtaining a copy
23  * of this source file (the "Software"), to deal in the Software without
24  * restriction, including without limitation the rights to use, copy, modify,
25  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
26  * and to permit persons to whom the Software is furnished to do so, subject to
27  * the following conditions:
28  * 
29  * The above copyright notice and this permission notice shall be included in
30  * all copies or substantial portions of the Software.
31  * 
32  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
37  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38  * IN THE SOFTWARE.
39  */
40
41 #include <linux/spinlock.h>
42 #include <linux/kthread.h>
43 #include <linux/freezer.h>
44 #include <linux/list.h>
45 #include <linux/module.h>
46 #include <asm/hypervisor.h>
47 #include "common.h"
48 #include <xen/balloon.h>
49 #include <xen/driver_util.h>
50 #include <xen/evtchn.h>
51 #include <xen/gnttab.h>
52 #include <linux/kernel.h>
53 #include <linux/fs.h>
54 #include <linux/mm.h>
55 #include <linux/errno.h>
56 #include <linux/major.h>
57 #include <linux/gfp.h>
58 #include <linux/poll.h>
59 #include <linux/delay.h>
60 #include <linux/nsproxy.h>
61 #include <asm/tlbflush.h>
62
63 #define MAX_TAP_DEV 256     /*the maximum number of tapdisk ring devices    */
64 #define MAX_DEV_NAME 100    /*max length of a tapdisk ring device name, e.g. blktap0 */
65
66 /*
67  * The maximum number of requests that can be outstanding at any time
68  * is determined by 
69  *
70  *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
71  *
72  * where mmap_alloc < MAX_DYNAMIC_MEM.
73  *
74  * TODO:
75  * mmap_alloc is initialised to 2 and should be adjustable on the fly via
76  * sysfs.
77  */
78 #define BLK_RING_SIZE           __CONST_RING_SIZE(blkif, PAGE_SIZE)
79 #define MAX_DYNAMIC_MEM         BLK_RING_SIZE
80 #define MAX_PENDING_REQS        BLK_RING_SIZE
81 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
82 #define MMAP_VADDR(_start, _req,_seg)                                   \
83         (_start +                                                       \
84          ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
85          ((_seg) * PAGE_SIZE))
86 static int mmap_pages = MMAP_PAGES;
87
88 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
89                       * have a bunch of pages reserved for shared
90                       * memory rings.
91                       */
92
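/*
 * Editorial illustration, not part of the driver: how the tapdev mmap
 * region above is laid out.  RING_PAGES page(s) of shared ring sit at
 * rings_vstart, followed at user_vstart by MAX_PENDING_REQS slots of
 * BLKIF_MAX_SEGMENTS_PER_REQUEST data pages each.  The helper below is a
 * plain-C restatement of MMAP_VADDR(), kept under "#if 0" so it is never
 * compiled; with the usual PAGE_SIZE of 4096 and 11 segments per request,
 * request 3 / segment 2 would map at user_vstart + 0x23000.
 */
#if 0	/* illustration only */
static unsigned long example_seg_vaddr(unsigned long user_vstart,
				       unsigned int req, unsigned int seg)
{
	/* one page per segment, laid out contiguously per request */
	return user_vstart +
		((unsigned long)req * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg)
		* PAGE_SIZE;
}
#endif
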
93 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
94 typedef struct domid_translate {
95         unsigned short domid;
96         unsigned short busid;
97 } domid_translate_t ;
98
99 typedef struct domid_translate_ext {
100         unsigned short domid;
101         u32 busid;
102 } domid_translate_ext_t ;
103
104 /*Data struct associated with each of the tapdisk devices*/
105 typedef struct tap_blkif {
106         struct mm_struct *mm;         /*User address space                   */
107         unsigned long rings_vstart;   /*Kernel memory mapping                */
108         unsigned long user_vstart;    /*User memory mapping                  */
109         unsigned long dev_inuse;      /*One process opens device at a time.  */
110         unsigned long dev_pending;    /*In process of being opened           */
111         unsigned long ring_ok;        /*make this ring->state                */
112         blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
113         wait_queue_head_t wait;       /*for poll                             */
114         unsigned long mode;           /*current switching mode               */
115         int minor;                    /*Minor number for tapdisk device      */
116         pid_t pid;                    /*tapdisk process id                   */
117         struct pid_namespace *pid_ns; /*... and its corresponding namespace  */
118         enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
119                                                   shutdown                   */
120         spinlock_t map_lock;          /*protects idx_map                     */
121         struct idx_map {
122                 u16 mem, req;
123         } *idx_map;                   /*Record the user ring id to kern
124                                         [req id, idx] tuple                  */
125         blkif_t *blkif;               /*Associate blkif with tapdev          */
126         struct domid_translate_ext trans; /*Translation from domid to bus.   */
127         struct vm_foreign_map foreign_map;    /*Mapping page */
128 } tap_blkif_t;
129
130 static struct tap_blkif *tapfds[MAX_TAP_DEV];
131 static int blktap_next_minor;
132
133 /* Run-time switchable: /sys/module/blktap/parameters/ */
134 static unsigned int log_stats = 0;
135 static unsigned int debug_lvl = 0;
136 module_param(log_stats, int, 0644);
137 module_param(debug_lvl, int, 0644);
138
139 /*
140  * Each outstanding request that we've passed to the lower device layers has a 
141  * 'pending_req' allocated to it.
142  */
143 typedef struct {
144         blkif_t       *blkif;
145         u64            id;
146         unsigned short mem_idx;
147         unsigned short nr_pages;
148         struct list_head free_list;
149 } pending_req_t;
150
151 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
152 static struct list_head pending_free;
153 static DEFINE_SPINLOCK(pending_free_lock);
154 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
155 static int alloc_pending_reqs;
156
157 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
158         return (req - pending_reqs[idx]);
159 }
160
161 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
162
163 #define BLKBACK_INVALID_HANDLE (~0)
164
165 static struct page **foreign_pages[MAX_DYNAMIC_MEM];
166 static inline struct page *idx_to_page(
167         unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
168 {
169         unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
170         return foreign_pages[mmap_idx][arr_idx];
171 }
172 static inline unsigned long idx_to_kaddr(
173         unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
174 {
175         unsigned long pfn = page_to_pfn(idx_to_page(mmap_idx,req_idx,sg_idx));
176         return (unsigned long)pfn_to_kaddr(pfn);
177 }
178
179 static unsigned short mmap_alloc = 0;
180 static unsigned short mmap_lock = 0;
181 static unsigned short mmap_inuse = 0;
182
183 /******************************************************************
184  * GRANT HANDLES
185  */
186
187 /* When using grant tables to map a frame for device access then the
188  * handle returned must be used to unmap the frame. This is needed to
189  * drop the ref count on the frame.
190  */
191 struct grant_handle_pair
192 {
193         grant_handle_t kernel;
194         grant_handle_t user;
195 };
196 #define INVALID_GRANT_HANDLE    0xFFFF
197
198 static struct grant_handle_pair 
199     pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
200 #define pending_handle(_id, _idx, _i) \
201     (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
202     + (_i)])
203
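/*
 * Editorial sketch, not compiled: why each mapped segment keeps a
 * (kernel, user) grant handle pair.  A foreign page is grant-mapped
 * twice -- once at its kernel address and once into the tapdisk
 * process's page table -- and each mapping must be torn down with the
 * handle the hypervisor returned for it, roughly as below (the real
 * work is done in fast_flush_area() and blktap_clear_pte()).
 */
#if 0	/* illustration only */
static void example_unmap_pair(unsigned long kvaddr, uint64_t user_ptep,
			       struct grant_handle_pair *hp)
{
	struct gnttab_unmap_grant_ref unmap[2];
	unsigned int n = 0;

	if (hp->kernel != INVALID_GRANT_HANDLE)
		gnttab_set_unmap_op(&unmap[n++], kvaddr,
				    GNTMAP_host_map, hp->kernel);
	if (hp->user != INVALID_GRANT_HANDLE)
		gnttab_set_unmap_op(&unmap[n++], user_ptep,
				    GNTMAP_host_map | GNTMAP_application_map |
				    GNTMAP_contains_pte, hp->user);
	if (n && HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					   unmap, n))
		BUG();
}
#endif
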
204
205 static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
206
207 #define BLKTAP_MINOR 0  /*/dev/xen/blktap has a dynamic major */
208 #define BLKTAP_DEV_DIR  "/dev/xen"
209
210 static int blktap_major;
211
212 /* blktap IOCTLs: */
213 #define BLKTAP_IOCTL_KICK_FE         1
214 #define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
215 #define BLKTAP_IOCTL_SETMODE         3
216 #define BLKTAP_IOCTL_SENDPID         4
217 #define BLKTAP_IOCTL_NEWINTF         5
218 #define BLKTAP_IOCTL_MINOR           6
219 #define BLKTAP_IOCTL_MAJOR           7
220 #define BLKTAP_QUERY_ALLOC_REQS      8
221 #define BLKTAP_IOCTL_FREEINTF        9
222 #define BLKTAP_IOCTL_NEWINTF_EXT     50
223 #define BLKTAP_IOCTL_PRINT_IDXS      100  
224
225 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
226 #define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
227 #define BLKTAP_MODE_INTERCEPT_FE     0x00000001
228 #define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
229
230 #define BLKTAP_MODE_INTERPOSE \
231            (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
232
233
234 static inline int BLKTAP_MODE_VALID(unsigned long arg)
235 {
236         return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
237                 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
238                 (arg == BLKTAP_MODE_INTERPOSE   ));
239 }
240
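/*
 * Editorial sketch of the user-space side, not compiled here: roughly how
 * a tapdisk-like daemon drives the ioctls above.  Minor 0 is the control
 * node, BLKTAP_IOCTL_NEWINTF_EXT returns the minor of a fresh per-VBD
 * tapdev, and SETMODE/SENDPID are then issued on that device.  The
 * device paths, the domid/busid variables and the surrounding error
 * handling are assumptions.
 */
#if 0	/* illustration only -- user-space fragment */
	int ctrl = open("/dev/xen/blktap0", O_RDWR);
	domid_translate_ext_t tr = { .domid = domid, .busid = busid };
	int minor = ioctl(ctrl, BLKTAP_IOCTL_NEWINTF_EXT, &tr);

	char path[32];
	snprintf(path, sizeof(path), "/dev/xen/blktap%d", minor);
	int fd = open(path, O_RDWR);

	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
	/* ...then mmap() the ring + data area and start servicing I/O. */
#endif
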
241 /* Requests passing through the tap to userspace are re-assigned an ID.
242  * We must record a mapping between the BE [IDX,ID] tuple and the userspace
243  * ring ID. 
244  */
245
246 #define INVALID_MIDX 0xdead
247
248 /*TODO: Convert to a free list*/
249 static inline unsigned int GET_NEXT_REQ(const struct idx_map *idx_map)
250 {
251         unsigned int i;
252
253         for (i = 0; i < MAX_PENDING_REQS; i++)
254                 if (idx_map[i].mem == INVALID_MIDX)
255                         break;
256
257         return i;
258 }
259
260 static inline unsigned int OFFSET_TO_USR_IDX(unsigned long offset)
261 {
262         return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
263 }
264
265 static inline unsigned int OFFSET_TO_SEG(unsigned long offset)
266 {
267         return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
268 }
269
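/*
 * Editorial sketch, not compiled: the round trip through idx_map.  On
 * dispatch a free user-ring slot is claimed and remembered, the request
 * forwarded to tapdisk carries that slot number as its id, and the
 * response coming back on the user ring is translated back into the
 * [mem, req] pair and from there into the guest's original request id
 * (see dispatch_rw_block_io() and blktap_read_ufe_ring() below).
 */
#if 0	/* illustration only */
	/* dispatch side */
	usr_idx = GET_NEXT_REQ(info->idx_map);
	info->idx_map[usr_idx].mem = pending_req->mem_idx;
	info->idx_map[usr_idx].req = pending_idx;
	/* the request copied onto the user ring carries id = usr_idx */

	/* completion side, where res.id == usr_idx */
	pending_idx = info->idx_map[usr_idx].req;
	mmap_idx    = info->idx_map[usr_idx].mem;
	make_response(blkif, pending_reqs[mmap_idx][pending_idx].id,
		      res.operation, res.status);
#endif
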
270
271 #define BLKTAP_INVALID_HANDLE(_g) \
272     (((_g->kernel) == INVALID_GRANT_HANDLE) &&  \
273      ((_g->user) == INVALID_GRANT_HANDLE))
274
275 #define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
276     (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
277     } while(0)
278
279
280 static char *blktap_devnode(struct device *dev, umode_t *mode)
281 {
282         return kasprintf(GFP_KERNEL, "xen/blktap%u", MINOR(dev->devt));
283 }
284
285 static struct device_type blktap_type = {
286         .devnode = blktap_devnode
287 };
288
289 /******************************************************************
290  * BLKTAP VM OPS
291  */
292
293 static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
294 {
295         /*
296          * if the page has not been mapped in by the driver then return
297          * VM_FAULT_SIGBUS to the domain.
298          */
299
300         return VM_FAULT_SIGBUS;
301 }
302
303 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
304                               unsigned long uvaddr,
305                               pte_t *ptep, int is_fullmm)
306 {
307         pte_t copy;
308         tap_blkif_t *info = NULL;
309         unsigned int seg, usr_idx, pending_idx, mmap_idx, count = 0;
310         unsigned long offset;
311         struct page *pg;
312         struct grant_handle_pair *khandle;
313         struct gnttab_unmap_grant_ref unmap[2];
314
315         /*
316          * If the address is before the start of the grant mapped region or
317          * if vm_file is NULL (meaning mmap failed and we have nothing to do)
318          */
319         if (vma->vm_file != NULL)
320                 info = vma->vm_file->private_data;
321         if (info == NULL || uvaddr < info->user_vstart)
322                 return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
323                                                    is_fullmm);
324
325         offset = (uvaddr - info->user_vstart) >> PAGE_SHIFT;
326         usr_idx = OFFSET_TO_USR_IDX(offset);
327         seg = OFFSET_TO_SEG(offset);
328
329         spin_lock(&info->map_lock);
330
331         pending_idx = info->idx_map[usr_idx].req;
332         mmap_idx = info->idx_map[usr_idx].mem;
333
334         /* fast_flush_area() may already have cleared this entry */
335         if (mmap_idx == INVALID_MIDX) {
336                 spin_unlock(&info->map_lock);
337                 return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
338                                                    is_fullmm);
339         }
340
341         pg = idx_to_page(mmap_idx, pending_idx, seg);
342         ClearPageReserved(pg);
343         info->foreign_map.map[offset + RING_PAGES] = NULL;
344
345         khandle = &pending_handle(mmap_idx, pending_idx, seg);
346
347         if (khandle->kernel != INVALID_GRANT_HANDLE) {
348                 unsigned long pfn = page_to_pfn(pg);
349
350                 gnttab_set_unmap_op(&unmap[count],
351                                     (unsigned long)pfn_to_kaddr(pfn),
352                                     GNTMAP_host_map, khandle->kernel);
353                 count++;
354
355                 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
356         }
357
358         if (khandle->user != INVALID_GRANT_HANDLE) {
359                 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
360
361                 copy = *ptep;
362                 gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep),
363                                     GNTMAP_host_map 
364                                     | GNTMAP_application_map 
365                                     | GNTMAP_contains_pte,
366                                     khandle->user);
367                 count++;
368         } else {
369                 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
370
371                 /* USING SHADOW PAGE TABLES. */
372                 copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
373                                                    is_fullmm);
374         }
375
376         if (count) {
377                 BLKTAP_INVALIDATE_HANDLE(khandle);
378                 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
379                                               unmap, count))
380                         BUG();
381         }
382
383         spin_unlock(&info->map_lock);
384
385         return copy;
386 }
387
388 static void blktap_vma_open(struct vm_area_struct *vma)
389 {
390         tap_blkif_t *info;
391         if (vma->vm_file == NULL)
392                 return;
393
394         info = vma->vm_file->private_data;
395         vma->vm_private_data =
396                 &info->foreign_map.map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT];
397 }
398
399 /* Tricky part:
400  * On a partial munmap, ->open() is called only on the split vma that is
401  * about to be released.  See split_vma() and do_munmap() in mm/mmap.c.
402  * So there is no chance to fix up vm_private_data of the end vma.
403  */
404 static void blktap_vma_close(struct vm_area_struct *vma)
405 {
406         tap_blkif_t *info;
407         struct vm_area_struct *next = vma->vm_next;
408
409         if (next == NULL ||
410             vma->vm_ops != next->vm_ops ||
411             vma->vm_end != next->vm_start ||
412             vma->vm_file == NULL ||
413             vma->vm_file != next->vm_file)
414                 return;
415
416         info = vma->vm_file->private_data;
417         next->vm_private_data =
418                 &info->foreign_map.map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];
419 }
420
421 static struct vm_operations_struct blktap_vm_ops = {
422         fault:    blktap_fault,
423         zap_pte:  blktap_clear_pte,
424         open:     blktap_vma_open,
425         close:    blktap_vma_close,
426 };
427
428 /******************************************************************
429  * BLKTAP FILE OPS
430  */
431  
432 /*Function Declarations*/
433 static tap_blkif_t *get_next_free_dev(void);
434 static int blktap_open(struct inode *inode, struct file *filp);
435 static int blktap_release(struct inode *inode, struct file *filp);
436 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
437 static long blktap_ioctl(struct file *filp, unsigned int cmd,
438                          unsigned long arg);
439 static unsigned int blktap_poll(struct file *file, poll_table *wait);
440
441 static const struct file_operations blktap_fops = {
442         .owner   = THIS_MODULE,
443         .poll    = blktap_poll,
444         .unlocked_ioctl = blktap_ioctl,
445         .open    = blktap_open,
446         .release = blktap_release,
447         .llseek  = no_llseek,
448         .mmap    = blktap_mmap,
449 };
450
451
452 static tap_blkif_t *get_next_free_dev(void)
453 {
454         tap_blkif_t *info;
455         int minor;
456
457         /*
458          * This is called only from the ioctl, which
459          * means we should always have interrupts enabled.
460          */
461         BUG_ON(irqs_disabled());
462
463         spin_lock_irq(&pending_free_lock);
464
465         /* tapfds[0] is always NULL */
466
467         for (minor = 1; minor < blktap_next_minor; minor++) {
468                 info = tapfds[minor];
469                 /* we could have failed a previous attempt. */
470                 if (!info ||
471                     ((!test_bit(0, &info->dev_inuse)) &&
472                      (info->dev_pending == 0)) ) {
473                         if (info) info->dev_pending = 1;
474                         goto found;
475                 }
476         }
477         info = NULL;
478         minor = -1;
479
480         /*
481          * We didn't find free device. If we can still allocate
482          * more, then we grab the next device minor that is
483          * available.  This is done while we are still under
484          * the protection of the pending_free_lock.
485          */
486         if (blktap_next_minor < MAX_TAP_DEV)
487                 minor = blktap_next_minor++;
488 found:
489         spin_unlock_irq(&pending_free_lock);
490
491         if (!info && minor > 0) {
492                 info = kzalloc(sizeof(*info), GFP_KERNEL);
493                 if (unlikely(!info)) {
494                         /*
495                          * If we failed here, try to put back
496                          * the next minor number. But if one
497                          * was just taken, then we just lose this
498                          * minor.  We can try to allocate this
499                          * minor again later.
500                          */
501                         spin_lock_irq(&pending_free_lock);
502                         if (blktap_next_minor == minor+1)
503                                 blktap_next_minor--;
504                         spin_unlock_irq(&pending_free_lock);
505                         goto out;
506                 }
507
508                 info->minor = minor;
509                 spin_lock_init(&info->map_lock);
510                 /*
511                  * Make sure that we have a minor before others can
512                  * see us.
513                  */
514                 wmb();
515                 tapfds[minor] = info;
516
517                 xen_class_device_create(&blktap_type, NULL,
518                                         MKDEV(blktap_major, minor),
519                                         NULL, "blktap%d", minor);
520         }
521
522 out:
523         return info;
524 }
525
526 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
527 {
528         tap_blkif_t *info;
529         int i;
530
531         for (i = 1; i < blktap_next_minor; i++) {
532                 info = tapfds[i];
533                 if ( info &&
534                      (info->trans.domid == domid) &&
535                      (info->trans.busid == xenbus_id) ) {
536                         info->blkif = blkif;
537                         info->status = RUNNING;
538                         return i;
539                 }
540         }
541         return -1;
542 }
543
544 void signal_tapdisk(int idx) 
545 {
546         tap_blkif_t *info;
547         struct task_struct *ptask;
548         struct mm_struct *mm;
549
550         /*
551          * if the userland tools set things up wrong, this could be negative;
552          * just don't try to signal in this case
553          */
554         if (idx < 0 || idx >= MAX_TAP_DEV)
555                 return;
556
557         info = tapfds[idx];
558         if (!info)
559                 return;
560
561         if (info->pid > 0) {
562                 ptask = pid_task(find_pid_ns(info->pid, info->pid_ns),
563                                  PIDTYPE_PID);
564                 if (ptask)
565                         info->status = CLEANSHUTDOWN;
566         }
567         info->blkif = NULL;
568
569         mm = xchg(&info->mm, NULL);
570         if (mm)
571                 mmput(mm);
572 }
573
574 static int blktap_open(struct inode *inode, struct file *filp)
575 {
576         blkif_sring_t *sring;
577         int idx = iminor(inode) - BLKTAP_MINOR;
578         tap_blkif_t *info;
579         int i;
580         
581         nonseekable_open(inode, filp);
582
583         /* ctrl device, treat differently */
584         if (!idx)
585                 return 0;
586         if (idx < 0 || idx >= MAX_TAP_DEV) {
587                 WPRINTK("No device /dev/xen/blktap%d\n", idx);
588                 return -ENODEV;
589         }
590
591         info = tapfds[idx];
592         if (!info) {
593                 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
594                         idx);
595                 return -ENODEV;
596         }
597
598         DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
599         
600         /*Only one process can access device at a time*/
601         if (test_and_set_bit(0, &info->dev_inuse))
602                 return -EBUSY;
603
604         info->dev_pending = 0;
605             
606         /* Allocate the fe ring. */
607         sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
608         if (sring == NULL)
609                 goto fail_nomem;
610
611         SetPageReserved(virt_to_page(sring));
612     
613         SHARED_RING_INIT(sring);
614         FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
615         
616         filp->private_data = info;
617         info->mm = NULL;
618
619         info->idx_map = kmalloc(sizeof(*info->idx_map) * MAX_PENDING_REQS,
620                                 GFP_KERNEL);
621         
622         if (info->idx_map == NULL)
623                 goto fail_nomem;
624
625         if (idx > 0) {
626                 init_waitqueue_head(&info->wait);
627                 for (i = 0; i < MAX_PENDING_REQS; i++) {
628                         info->idx_map[i].mem = INVALID_MIDX;
629                         info->idx_map[i].req = ~0;
630                 }
631         }
632
633         DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
634         return 0;
635
636  fail_nomem:
        if (sring) {
                ClearPageReserved(virt_to_page(sring));
                free_page((unsigned long)sring);
        }
        clear_bit(0, &info->dev_inuse);
637         return -ENOMEM;
638 }
639
640 static int blktap_release(struct inode *inode, struct file *filp)
641 {
642         tap_blkif_t *info = filp->private_data;
643         struct mm_struct *mm;
644         
645         /* check for control device */
646         if (!info)
647                 return 0;
648
649         info->ring_ok = 0;
650         smp_wmb();
651         info->rings_vstart = 0;
652
653         mm = xchg(&info->mm, NULL);
654         if (mm)
655                 mmput(mm);
656         kfree(info->foreign_map.map);
657         info->foreign_map.map = NULL;
658
659         /* Free the ring page. */
660         ClearPageReserved(virt_to_page(info->ufe_ring.sring));
661         free_page((unsigned long) info->ufe_ring.sring);
662
663         if (info->idx_map) {
664                 kfree(info->idx_map);
665                 info->idx_map = NULL;
666         }
667
668         if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
669                 if (info->blkif->xenblkd != NULL) {
670                         kthread_stop(info->blkif->xenblkd);
671                         info->blkif->xenblkd = NULL;
672                 }
673                 info->status = CLEANSHUTDOWN;
674         }
675
676         clear_bit(0, &info->dev_inuse);
677         DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
678
679         return 0;
680 }
681
682
683 /* Note on mmap:
684  * We need to map pages to user space in a way that will allow the block
685  * subsystem set up direct IO to them.  This couldn't be done before, because
686  * there isn't really a sane way to translate a user virtual address down to a 
687  * physical address when the page belongs to another domain.
688  *
689  * My first approach was to map the page in to kernel memory, add an entry
690  * for it in the physical frame list (using alloc_lomem_region as in blkback)
691  * and then attempt to map that page up to user space.  This is disallowed
692  * by xen though, which realizes that we don't really own the machine frame
693  * underlying the physical page.
694  *
695  * The new approach is to provide explicit support for this in xen linux.
696  * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
697  * mapped from other vms.  vma->vm_private_data is set up as a mapping 
698  * from pages to actual page structs.  There is a new clause in get_user_pages
699  * that does the right thing for this sort of mapping.
700  */
701 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
702 {
703         int size;
704         tap_blkif_t *info = filp->private_data;
705         int ret;
706
707         if (info == NULL) {
708                 WPRINTK("mmap: no private data?\n");
709                 return -ENOMEM;
710         }
711
712         if (info->rings_vstart) {
713                 WPRINTK("mmap already called on filp %p (minor %d)\n",
714                         filp, info->minor);
715                 return -EPERM;
716         }
717
718         vma->vm_flags |= VM_RESERVED;
719         vma->vm_ops = &blktap_vm_ops;
720
721         size = vma->vm_end - vma->vm_start;
722         if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
723                 WPRINTK("you _must_ map exactly %d pages!\n",
724                        mmap_pages + RING_PAGES);
725                 return -EAGAIN;
726         }
727
728         size >>= PAGE_SHIFT;
729         info->rings_vstart = vma->vm_start;
730         info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
731     
732         /* Map the ring pages to the start of the region and reserve it. */
733         if (xen_feature(XENFEAT_auto_translated_physmap))
734                 ret = vm_insert_page(vma, vma->vm_start,
735                                      virt_to_page(info->ufe_ring.sring));
736         else
737                 ret = remap_pfn_range(vma, vma->vm_start,
738                                       __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
739                                       PAGE_SIZE, vma->vm_page_prot);
740         if (ret) {
741                 WPRINTK("Mapping user ring failed!\n");
742                 goto fail;
743         }
744
745         /* Mark this VM as containing foreign pages, and set up mappings. */
746         info->foreign_map.map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
747                             sizeof(*info->foreign_map.map), GFP_KERNEL);
748         if (info->foreign_map.map == NULL) {
749                 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
750                 goto fail;
751         }
752
753         vma->vm_private_data = &info->foreign_map;
754         vma->vm_flags |= VM_FOREIGN;
755         vma->vm_flags |= VM_DONTCOPY;
756
757 #ifdef CONFIG_X86
758         vma->vm_mm->context.has_foreign_mappings = 1;
759 #endif
760
761         info->mm = get_task_mm(current);
762         smp_wmb();
763         info->ring_ok = 1;
764         return 0;
765  fail:
766         /* Clear any active mappings. */
767         zap_page_range(vma, vma->vm_start, 
768                        vma->vm_end - vma->vm_start, NULL);
769         info->rings_vstart = 0;
770
771         return -ENOMEM;
772 }
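
/*
 * Editorial sketch, not compiled: what the VM_FOREIGN mapping set up in
 * blktap_mmap() amounts to.  vma->vm_private_data points at
 * info->foreign_map, whose map[] array is indexed by page offset within
 * the mapping, so the patched get_user_pages() can resolve a user
 * address inside this VMA roughly as below instead of walking page
 * tables it does not own.
 */
#if 0	/* illustration only */
static struct page *example_foreign_page(struct vm_area_struct *vma,
					 unsigned long uvaddr)
{
	struct vm_foreign_map *fmap = vma->vm_private_data;

	return fmap->map[(uvaddr - vma->vm_start) >> PAGE_SHIFT];
}
#endif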
773
774
775 static long blktap_ioctl(struct file *filp, unsigned int cmd,
776                          unsigned long arg)
777 {
778         tap_blkif_t *info = filp->private_data;
779
780         switch(cmd) {
781         case BLKTAP_IOCTL_KICK_FE: 
782         {
783                 /* There are fe messages to process. */
784                 return blktap_read_ufe_ring(info);
785         }
786         case BLKTAP_IOCTL_SETMODE:
787         {
788                 if (info) {
789                         if (BLKTAP_MODE_VALID(arg)) {
790                                 info->mode = arg;
791                                 /* XXX: may need to flush rings here. */
792                                 DPRINTK("set mode to %lx\n", arg);
793                                 return 0;
794                         }
795                 }
796                 return 0;
797         }
798         case BLKTAP_IOCTL_PRINT_IDXS:
799         {
800                 if (info) {
801                         pr_info("User Rings: \n-----------\n");
802                         pr_info("UF: rsp_cons: %2d, req_prod_pvt: %2d "
803                                 "| req_prod: %2d, rsp_prod: %2d\n",
804                                 info->ufe_ring.rsp_cons,
805                                 info->ufe_ring.req_prod_pvt,
806                                 info->ufe_ring.sring->req_prod,
807                                 info->ufe_ring.sring->rsp_prod);
808                 }
809                 return 0;
810         }
811         case BLKTAP_IOCTL_SENDPID:
812         {
813                 if (info) {
814                         info->pid = (pid_t)arg;
815                         info->pid_ns = current->nsproxy->pid_ns;
816                         DPRINTK("pid received %p:%d\n",
817                                 info->pid_ns, info->pid);
818                 }
819                 return 0;
820         }
821         case BLKTAP_IOCTL_NEWINTF:
822         {               
823                 uint64_t val = (uint64_t)arg;
824                 domid_translate_t *tr = (domid_translate_t *)&val;
825
826                 DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
827                        tr->domid, tr->busid);
828                 info = get_next_free_dev();
829                 if (!info) {
830                         WPRINTK("Error initialising /dev/xen/blktap - "
831                                 "No more devices\n");
832                         return -1;
833                 }
834                 info->trans.domid = tr->domid;
835                 info->trans.busid = tr->busid;
836                 return info->minor;
837         }
838         case BLKTAP_IOCTL_NEWINTF_EXT:
839         {
840                 void __user *udata = (void __user *) arg;
841                 domid_translate_ext_t tr;
842
843                 if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t)))
844                         return -EFAULT;
845
846                 DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n", 
847                        tr.domid, tr.busid);
848                 info = get_next_free_dev();
849                 if (!info) {
850                         WPRINTK("Error initialising /dev/xen/blktap - "
851                                 "No more devices\n");
852                         return -1;
853                 }
854                 info->trans.domid = tr.domid;
855                 info->trans.busid = tr.busid;
856                 return info->minor;
857         }
858         case BLKTAP_IOCTL_FREEINTF:
859         {
860                 unsigned long dev = arg;
861                 unsigned long flags;
862
863                 if (info || dev >= MAX_TAP_DEV)
864                         return -EINVAL;
865
866                 info = tapfds[dev];
867                 if (!info)
868                         return 0; /* should this be an error? */
869
870                 spin_lock_irqsave(&pending_free_lock, flags);
871                 if (info->dev_pending)
872                         info->dev_pending = 0;
873                 spin_unlock_irqrestore(&pending_free_lock, flags);
874
875                 return 0;
876         }
877         case BLKTAP_IOCTL_MINOR:
878                 if (!info) {
879                         unsigned long dev = arg;
880
881                         if (dev >= MAX_TAP_DEV)
882                                 return -EINVAL;
883
884                         info = tapfds[dev];
885                         if (!info)
886                                 return -EINVAL;
887                 }
888
889                 return info->minor;
890
891         case BLKTAP_IOCTL_MAJOR:
892                 return blktap_major;
893
894         case BLKTAP_QUERY_ALLOC_REQS:
895                 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%lu\n",
896                         alloc_pending_reqs, MAX_PENDING_REQS);
897                 return (alloc_pending_reqs * 100) / MAX_PENDING_REQS;
898         }
899         return -ENOIOCTLCMD;
900 }
901
902 static unsigned int blktap_poll(struct file *filp, poll_table *wait)
903 {
904         tap_blkif_t *info = filp->private_data;
905         
906         /* do not work on the control device */
907         if (!info)
908                 return 0;
909
910         poll_wait(filp, &info->wait, wait);
911         if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
912                 RING_PUSH_REQUESTS(&info->ufe_ring);
913                 return POLLIN | POLLRDNORM;
914         }
915         return 0;
916 }
917
918 static void blktap_kick_user(int idx)
919 {
920         tap_blkif_t *info;
921
922         if (idx < 0 || idx >= MAX_TAP_DEV)
923                 return;
924
925         info = tapfds[idx];
926         if (!info)
927                 return;
928
929         wake_up_interruptible(&info->wait);
930
931         return;
932 }
933
934 static int do_block_io_op(blkif_t *blkif);
935 static void dispatch_rw_block_io(blkif_t *blkif,
936                                  blkif_request_t *req,
937                                  pending_req_t *pending_req);
938 static void make_response(blkif_t *blkif, u64 id,
939                           unsigned short op, int st);
940
941 /******************************************************************
942  * misc small helpers
943  */
944 static int req_increase(void)
945 {
946         int i, j;
947
948         if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
949                 return -EINVAL;
950
951         pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
952                                             * MAX_PENDING_REQS, GFP_KERNEL);
953         foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
954
955         if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
956                 goto out_of_memory;
957
958         DPRINTK("reqs=%lu, pages=%d\n", MAX_PENDING_REQS, mmap_pages);
959
960         for (i = 0; i < MAX_PENDING_REQS; i++) {
961                 list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
962                               &pending_free);
963                 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
964                 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
965                         BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
966                                                                  i, j));
967         }
968
969         mmap_alloc++;
970         DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
971         return 0;
972
973  out_of_memory:
974         free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
975         kfree(pending_reqs[mmap_alloc]);
976         WPRINTK("%s: out of memory\n", __FUNCTION__);
977         return -ENOMEM;
978 }
979
980 static void mmap_req_del(int mmap)
981 {
982         assert_spin_locked(&pending_free_lock);
983
984         kfree(pending_reqs[mmap]);
985         pending_reqs[mmap] = NULL;
986
987         free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
988         foreign_pages[mmap] = NULL;
989
990         mmap_lock = 0;
991         DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
992         mmap_alloc--;
993 }
994
995 static pending_req_t* alloc_req(void)
996 {
997         pending_req_t *req = NULL;
998         unsigned long flags;
999
1000         spin_lock_irqsave(&pending_free_lock, flags);
1001
1002         if (!list_empty(&pending_free)) {
1003                 req = list_entry(pending_free.next, pending_req_t, free_list);
1004                 list_del(&req->free_list);
1005         }
1006
1007         if (req)
1008                 alloc_pending_reqs++;
1009         spin_unlock_irqrestore(&pending_free_lock, flags);
1010
1011         return req;
1012 }
1013
1014 static void free_req(pending_req_t *req)
1015 {
1016         unsigned long flags;
1017         int was_empty;
1018
1019         spin_lock_irqsave(&pending_free_lock, flags);
1020
1021         alloc_pending_reqs--;
1022         if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
1023                 mmap_inuse--;
1024                 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
1025                 spin_unlock_irqrestore(&pending_free_lock, flags);
1026                 return;
1027         }
1028         was_empty = list_empty(&pending_free);
1029         list_add(&req->free_list, &pending_free);
1030
1031         spin_unlock_irqrestore(&pending_free_lock, flags);
1032
1033         if (was_empty)
1034                 wake_up(&pending_free_wq);
1035 }
1036
1037 static void blktap_zap_page_range(struct mm_struct *mm,
1038                                   unsigned long uvaddr, int nr_pages)
1039 {
1040         unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT);
1041         struct vm_area_struct *vma;
1042
1043         vma = find_vma(mm, uvaddr);
1044         while (vma && uvaddr < end) {
1045                 unsigned long s = max(uvaddr, vma->vm_start);
1046                 unsigned long e = min(end, vma->vm_end);
1047
1048                 zap_page_range(vma, s, e - s, NULL);
1049
1050                 uvaddr = e;
1051                 vma = vma->vm_next;
1052         }
1053 }
1054
1055 static void fast_flush_area(pending_req_t *req, unsigned int k_idx,
1056                              unsigned int u_idx, tap_blkif_t *info)
1057 {
1058         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1059         unsigned int i, mmap_idx, invcount = 0;
1060         struct grant_handle_pair *khandle;
1061         uint64_t ptep;
1062         int ret;
1063         unsigned long uvaddr;
1064         struct mm_struct *mm = info->mm;
1065
1066         if (mm != NULL)
1067                 down_read(&mm->mmap_sem);
1068
1069         if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
1070  slow:
1071                 blktap_zap_page_range(mm,
1072                                       MMAP_VADDR(info->user_vstart, u_idx, 0),
1073                                       req->nr_pages);
1074                 info->idx_map[u_idx].mem = INVALID_MIDX;
1075                 up_read(&mm->mmap_sem);
1076                 return;
1077         }
1078
1079         mmap_idx = req->mem_idx;
1080
1081         spin_lock(&info->map_lock);
1082
1083         for (i = 0; i < req->nr_pages; i++) {
1084                 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
1085
1086                 khandle = &pending_handle(mmap_idx, k_idx, i);
1087
1088                 if (khandle->kernel != INVALID_GRANT_HANDLE) {
1089                         gnttab_set_unmap_op(&unmap[invcount],
1090                                             idx_to_kaddr(mmap_idx, k_idx, i),
1091                                             GNTMAP_host_map, khandle->kernel);
1092                         invcount++;
1093
1094                         set_phys_to_machine(
1095                                 page_to_pfn(idx_to_page(mmap_idx, k_idx, i)),
1096                                 INVALID_P2M_ENTRY);
1097                 }
1098
1099                 if (mm != NULL && khandle->user != INVALID_GRANT_HANDLE) {
1100                         BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
1101                         if (create_lookup_pte_addr(
1102                                 mm,
1103                                 MMAP_VADDR(info->user_vstart, u_idx, i),
1104                                 &ptep) !=0) {
1105                                 spin_unlock(&info->map_lock);
1106                                 WPRINTK("Couldn't get a pte addr!\n");
1107                                 goto slow;
1108                         }
1109
1110                         gnttab_set_unmap_op(&unmap[invcount], ptep,
1111                                             GNTMAP_host_map
1112                                             | GNTMAP_application_map
1113                                             | GNTMAP_contains_pte,
1114                                             khandle->user);
1115                         invcount++;
1116                 }
1117
1118                 BLKTAP_INVALIDATE_HANDLE(khandle);
1119         }
1120         ret = HYPERVISOR_grant_table_op(
1121                 GNTTABOP_unmap_grant_ref, unmap, invcount);
1122         BUG_ON(ret);
1123         
1124         info->idx_map[u_idx].mem = INVALID_MIDX;
1125
1126         spin_unlock(&info->map_lock);
1127         if (mm != NULL)
1128                 up_read(&mm->mmap_sem);
1129 }
1130
1131 /******************************************************************
1132  * SCHEDULER FUNCTIONS
1133  */
1134
1135 static void print_stats(blkif_t *blkif)
1136 {
1137         printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d |  pk %4d\n",
1138                current->comm, blkif->st_oo_req,
1139                blkif->st_rd_req, blkif->st_wr_req, blkif->st_pk_req);
1140         blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
1141         blkif->st_rd_req = 0;
1142         blkif->st_wr_req = 0;
1143         blkif->st_oo_req = 0;
1144         blkif->st_pk_req = 0;
1145 }
1146
1147 int tap_blkif_schedule(void *arg)
1148 {
1149         blkif_t *blkif = arg;
1150         tap_blkif_t *info;
1151
1152         blkif_get(blkif);
1153
1154         if (debug_lvl)
1155                 printk(KERN_DEBUG "%s: started\n", current->comm);
1156
1157         while (!kthread_should_stop()) {
1158                 if (try_to_freeze())
1159                         continue;
1160
1161                 wait_event_interruptible(
1162                         blkif->wq,
1163                         blkif->waiting_reqs || kthread_should_stop());
1164                 wait_event_interruptible(
1165                         pending_free_wq,
1166                         !list_empty(&pending_free) || kthread_should_stop());
1167
1168                 blkif->waiting_reqs = 0;
1169                 smp_mb(); /* clear flag *before* checking for work */
1170
1171                 if (do_block_io_op(blkif))
1172                         blkif->waiting_reqs = 1;
1173
1174                 if (log_stats && time_after(jiffies, blkif->st_print))
1175                         print_stats(blkif);
1176         }
1177
1178         if (log_stats)
1179                 print_stats(blkif);
1180         if (debug_lvl)
1181                 printk(KERN_DEBUG "%s: exiting\n", current->comm);
1182
1183         blkif->xenblkd = NULL;
1184         info = tapfds[blkif->dev_num];
1185         blkif_put(blkif);
1186
1187         if (info) {
1188                 struct mm_struct *mm = xchg(&info->mm, NULL);
1189
1190                 if (mm)
1191                         mmput(mm);
1192         }
1193
1194         return 0;
1195 }
1196
1197 /******************************************************************
1198  * COMPLETION CALLBACK -- Called by user level ioctl()
1199  */
1200
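/*
 * Editorial sketch of the user-space half, not compiled here: tapdisk
 * maps the tapdev, treats the first RING_PAGES page(s) as the shared
 * blkif ring, poll()s for new requests, and after completing the I/O
 * pushes a response and issues BLKTAP_IOCTL_KICK_FE so that
 * blktap_read_ufe_ring() below reclaims the slot.  The main-loop details
 * are assumptions; only the ring macros and ioctl usage mirror this file.
 */
#if 0	/* illustration only -- user-space fragment */
	blkif_sring_t *sring = mmap(NULL, (RING_PAGES + MMAP_PAGES) * 4096,
				    PROT_READ | PROT_WRITE, MAP_SHARED,
				    fd, 0);
	blkif_back_ring_t ring;
	BACK_RING_INIT(&ring, sring, 4096);

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		poll(&pfd, 1, -1);
		while (RING_HAS_UNCONSUMED_REQUESTS(&ring)) {
			blkif_request_t *req =
				RING_GET_REQUEST(&ring, ring.req_cons);
			blkif_response_t *rsp =
				RING_GET_RESPONSE(&ring, ring.rsp_prod_pvt);

			/* ...perform the I/O on the mapped data pages... */
			rsp->id = req->id;
			rsp->operation = req->operation;
			rsp->status = BLKIF_RSP_OKAY;
			ring.req_cons++;
			ring.rsp_prod_pvt++;
		}
		RING_PUSH_RESPONSES(&ring);
		ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);
	}
#endif
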
1201 static int blktap_read_ufe_ring(tap_blkif_t *info)
1202 {
1203         /* This is called to read responses from the UFE ring. */
1204         RING_IDX i, j, rp;
1205         blkif_response_t *resp;
1206         blkif_t *blkif=NULL;
1207         unsigned int pending_idx, usr_idx, mmap_idx;
1208         pending_req_t *pending_req;
1209         
1210         if (!info)
1211                 return 0;
1212
1213         /* We currently only forward packets in INTERCEPT_FE mode. */
1214         if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
1215                 return 0;
1216
1217         /* for each outstanding message on the UFEring  */
1218         rp = info->ufe_ring.sring->rsp_prod;
1219         rmb();
1220         
1221         for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
1222                 blkif_response_t res;
1223                 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
1224                 memcpy(&res, resp, sizeof(res));
1225                 mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
1226                 ++info->ufe_ring.rsp_cons;
1227
1228                 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
1229                 if (res.id >= MAX_PENDING_REQS) {
1230                         WPRINTK("incorrect req map [%llx]\n",
1231                                 (unsigned long long)res.id);
1232                         continue;
1233                 }
1234
1235                 usr_idx = (unsigned int)res.id;
1236                 pending_idx = info->idx_map[usr_idx].req;
1237                 mmap_idx = info->idx_map[usr_idx].mem;
1238
1239                 if (mmap_idx >= mmap_alloc ||
1240                     pending_idx >= MAX_PENDING_REQS) {
1241                         WPRINTK("incorrect req map [%d],"
1242                                 " internal map [%d,%d]\n",
1243                                 usr_idx, mmap_idx, pending_idx);
1244                         continue;
1245                 }
1246
1247                 pending_req = &pending_reqs[mmap_idx][pending_idx];
1248                 blkif = pending_req->blkif;
1249
1250                 for (j = 0; j < pending_req->nr_pages; j++) {
1251
1252                         unsigned long uvaddr;
1253                         struct page *pg;
1254                         int offset;
1255
1256                         uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
1257
1258                         pg = idx_to_page(mmap_idx, pending_idx, j);
1259                         ClearPageReserved(pg);
1260                         offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1261                         info->foreign_map.map[offset] = NULL;
1262                 }
1263                 fast_flush_area(pending_req, pending_idx, usr_idx, info);
1264                 make_response(blkif, pending_req->id, res.operation,
1265                               res.status);
1266                 blkif_put(pending_req->blkif);
1267                 free_req(pending_req);
1268         }
1269                 
1270         return 0;
1271 }
1272
1273
1274 /******************************************************************************
1275  * NOTIFICATION FROM GUEST OS.
1276  */
1277
1278 static void blkif_notify_work(blkif_t *blkif)
1279 {
1280         blkif->waiting_reqs = 1;
1281         wake_up(&blkif->wq);
1282 }
1283
1284 irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
1285 {
1286         blkif_notify_work(dev_id);
1287         return IRQ_HANDLED;
1288 }
1289
1290
1291
1292 /******************************************************************
1293  * DOWNWARD CALLS -- These interface with the block-device layer proper.
1294  */
1295 static int print_dbug = 1;
1296 static int do_block_io_op(blkif_t *blkif)
1297 {
1298         blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1299         blkif_request_t req;
1300         pending_req_t *pending_req;
1301         RING_IDX rc, rp;
1302         int more_to_do = 0;
1303         tap_blkif_t *info;
1304
1305         rc = blk_rings->common.req_cons;
1306         rp = blk_rings->common.sring->req_prod;
1307         rmb(); /* Ensure we see queued requests up to 'rp'. */
1308
1309         /*Check blkif has corresponding UE ring*/
1310         if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV) {
1311                 /*oops*/
1312                 if (print_dbug) {
1313                         WPRINTK("Corresponding UE ring does not exist!\n");
1315                         print_dbug = 0; /*We only print this message once*/
1316                 }
1317                 return 0;
1318         }
1319
1320         info = tapfds[blkif->dev_num];
1321
1322         if (!info || !test_bit(0, &info->dev_inuse)) {
1323                 if (print_dbug) {
1324                         WPRINTK("Can't get UE info!\n");
1325                         print_dbug = 0;
1326                 }
1327                 return 0;
1328         }
1329
1330         while (rc != rp) {
1331                 
1332                 if (RING_FULL(&info->ufe_ring)) {
1333                         WPRINTK("RING_FULL! More to do\n");
1334                         more_to_do = 1;
1335                         break;
1336                 }
1337
1338                 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
1339                         WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1340                                " More to do\n");
1341                         more_to_do = 1;
1342                         break;          
1343                 }
1344
1345                 if (kthread_should_stop()) {
1346                         more_to_do = 1;
1347                         break;
1348                 }
1349
1350                 pending_req = alloc_req();
1351                 if (NULL == pending_req) {
1352                         blkif->st_oo_req++;
1353                         more_to_do = 1;
1354                         break;
1355                 }
1356
1357                 switch (blkif->blk_protocol) {
1358                 case BLKIF_PROTOCOL_NATIVE:
1359                         memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
1360                                sizeof(req));
1361                         break;
1362                 case BLKIF_PROTOCOL_X86_32:
1363                         blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1364                         break;
1365                 case BLKIF_PROTOCOL_X86_64:
1366                         blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1367                         break;
1368                 default:
1369                         BUG();
1370                 }
1371                 blk_rings->common.req_cons = ++rc; /* before make_response() */
1372
1373                 /* Apply all sanity checks to /private copy/ of request. */
1374                 barrier();
1375
1376                 switch (req.operation) {
1377                 case BLKIF_OP_READ:
1378                         blkif->st_rd_req++;
1379                         dispatch_rw_block_io(blkif, &req, pending_req);
1380                         break;
1381
1382                 case BLKIF_OP_WRITE_BARRIER:
1383                         /* TODO Some counter? */
1384                         /* Fall through */
1385                 case BLKIF_OP_WRITE:
1386                         blkif->st_wr_req++;
1387                         dispatch_rw_block_io(blkif, &req, pending_req);
1388                         break;
1389
1390                 case BLKIF_OP_PACKET:
1391                         blkif->st_pk_req++;
1392                         dispatch_rw_block_io(blkif, &req, pending_req);
1393                         break;
1394
1395                 default:
1396                         /* A good sign something is wrong: sleep for a while to
1397                          * avoid excessive CPU consumption by a bad guest. */
1398                         msleep(1);
1399                         WPRINTK("unknown operation [%d]\n",
1400                                 req.operation);
1401                         make_response(blkif, req.id, req.operation,
1402                                       BLKIF_RSP_ERROR);
1403                         free_req(pending_req);
1404                         break;
1405                 }
1406
1407                 /* Yield point for this unbounded loop. */
1408                 cond_resched();
1409         }
1410                 
1411         blktap_kick_user(blkif->dev_num);
1412
1413         return more_to_do;
1414 }
1415
1416 static void dispatch_rw_block_io(blkif_t *blkif,
1417                                  blkif_request_t *req,
1418                                  pending_req_t *pending_req)
1419 {
1420         struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1421         unsigned int nseg;
1422         int ret, i, op, nr_sects = 0;
1423         tap_blkif_t *info;
1424         blkif_request_t *target;
1425         unsigned int mmap_idx = pending_req->mem_idx;
1426         unsigned int pending_idx = RTN_PEND_IDX(pending_req, mmap_idx);
1427         unsigned int usr_idx;
1428         uint32_t flags;
1429         struct mm_struct *mm;
1430         struct vm_area_struct *vma = NULL;
1431
1432         if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV)
1433                 goto fail_response;
1434
1435         info = tapfds[blkif->dev_num];
1436         if (info == NULL)
1437                 goto fail_response;
1438
1439         /* Check we have space on user ring - should never fail. */
1440         spin_lock(&info->map_lock);
1441         usr_idx = GET_NEXT_REQ(info->idx_map);
1442         spin_unlock(&info->map_lock);
1443         if (usr_idx >= MAX_PENDING_REQS) {
1444                 WARN_ON(1);
1445                 goto fail_response;
1446         }
1447
1448         /* Check that number of segments is sane. */
1449         nseg = req->nr_segments;
1450         if (unlikely(nseg == 0 && req->operation != BLKIF_OP_WRITE_BARRIER) ||
1451             unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1452                 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1453                 goto fail_response;
1454         }
1455         
1456         /* Make sure userspace is ready. */
1457         if (!info->ring_ok) {
1458                 WPRINTK("ring not ready for requests!\n");
1459                 goto fail_response;
1460         }
1461         smp_rmb();
1462
1463         if (RING_FULL(&info->ufe_ring)) {
1464                 WPRINTK("fe_ring is full, "
1465                         "IO Request will be dropped. %d %d\n",
1466                         RING_SIZE(&info->ufe_ring),
1467                         RING_SIZE(&blkif->blk_rings.common));
1468                 goto fail_response;
1469         }
1470
1471         pending_req->blkif     = blkif;
1472         pending_req->id        = req->id;
1473         pending_req->nr_pages  = nseg;
1474
1475         flags = GNTMAP_host_map;
1476         switch (req->operation) {
1477         case BLKIF_OP_WRITE:
1478         case BLKIF_OP_WRITE_BARRIER:
1479                 flags |= GNTMAP_readonly;
1480                 break;
1481         }
1482
1483         op = 0;
1484         mm = info->mm;
1485         if (!xen_feature(XENFEAT_auto_translated_physmap))
1486                 down_read(&mm->mmap_sem);
1487         for (i = 0; i < nseg; i++) {
1488                 unsigned long uvaddr;
1489                 unsigned long kvaddr;
1490                 uint64_t ptep;
1491
1492                 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1493                 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1494
1495                 gnttab_set_map_op(&map[op], kvaddr, flags,
1496                                   req->seg[i].gref, blkif->domid);
1497                 op++;
1498
1499                 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1500                         /* Now map it to user. */
1501                         ret = create_lookup_pte_addr(mm, uvaddr, &ptep);
1502                         if (ret) {
1503                                 up_read(&mm->mmap_sem);
1504                                 WPRINTK("Couldn't get a pte addr!\n");
1505                                 goto fail_response;
1506                         }
1507
1508                         gnttab_set_map_op(&map[op], ptep,
1509                                           flags | GNTMAP_application_map
1510                                                 | GNTMAP_contains_pte,
1511                                           req->seg[i].gref, blkif->domid);
1512                         op++;
1513                 }
1514
1515                 nr_sects += (req->seg[i].last_sect -
1516                              req->seg[i].first_sect + 1);
1517         }
1518
1519         if (xen_feature(XENFEAT_auto_translated_physmap))
1520                 down_read(&mm->mmap_sem);
1521
1522         spin_lock(&info->map_lock);
1523
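             /*
              * Issue the whole batch in one grant-table hypercall; per-entry
              * status is examined below.
              */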
1524         ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1525         BUG_ON(ret);
1526
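             /*
              * Harvest the mapping results.  In the non-auto-translated case
              * map[] holds kernel/user pairs per segment.  Handles (or
              * INVALID_GRANT_HANDLE on failure) are recorded so
              * fast_flush_area() can undo the mappings, and the foreign_map
              * table (plus, for non-auto-translated domains, the p2m) is
              * updated for each successfully mapped page.
              */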
1527         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1528                 for (i = 0; i < (nseg * 2); i += 2) {
1529                         unsigned long uvaddr;
1530                         unsigned long offset;
1531                         struct page *pg;
1532
1533                         uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1534
1535                         gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
1536
1537                         if (unlikely(map[i].status != GNTST_okay)) {
1538                                 WPRINTK("invalid kernel buffer -- could not remap it\n");
1539                                 ret = 1;
1540                                 map[i].handle = INVALID_GRANT_HANDLE;
1541                         }
1542
1543                         if (unlikely(map[i+1].status != GNTST_okay)) {
1544                                 WPRINTK("invalid user buffer -- could not remap it\n");
1545                                 ret = 1;
1546                                 map[i+1].handle = INVALID_GRANT_HANDLE;
1547                         }
1548
1549                         pending_handle(mmap_idx, pending_idx, i/2).kernel =
1550                                 map[i].handle;
1551                         pending_handle(mmap_idx, pending_idx, i/2).user =
1552                                 map[i+1].handle;
1553
1554                         if (ret)
1555                                 continue;
1556
1557                         pg = idx_to_page(mmap_idx, pending_idx, i/2);
1558                         set_phys_to_machine(page_to_pfn(pg),
1559                                             FOREIGN_FRAME(map[i].dev_bus_addr
1560                                                           >> PAGE_SHIFT));
1561                         offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1562                         info->foreign_map.map[offset] = pg;
1563                 }
1564         } else {
1565                 for (i = 0; i < nseg; i++) {
1566                         unsigned long uvaddr;
1567                         unsigned long offset;
1568                         struct page *pg;
1569
1570                         uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1571
1572                         gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
1573
1574                         if (unlikely(map[i].status != GNTST_okay)) {
1575                                 WPRINTK("invalid kernel buffer -- could not remap it\n");
1576                                 ret = 1;
1577                                 map[i].handle = INVALID_GRANT_HANDLE;
1578                         }
1579
1580                         pending_handle(mmap_idx, pending_idx, i).kernel =
1581                                 map[i].handle;
1582
1583                         if (ret)
1584                                 continue;
1585
1586                         offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1587                         pg = idx_to_page(mmap_idx, pending_idx, i);
1588                         info->foreign_map.map[offset] = pg;
1589                 }
1590         }
1591
1592         /* Record the [mmap_idx, pending_idx] to [usr_idx] mapping. */
1593         info->idx_map[usr_idx].mem = mmap_idx;
1594         info->idx_map[usr_idx].req = pending_idx;
1595
1596         spin_unlock(&info->map_lock);
1597
1598         if (ret)
1599                 goto fail_flush;
1600
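             /*
              * For auto-translated domains only the kernel mapping was set up
              * by the grant op above, so insert each page directly into the
              * tapdisk VMA here; the walk tolerates areas already removed by
              * a racing munmap().
              */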
1601         if (xen_feature(XENFEAT_auto_translated_physmap)) {
1602                 for (i = 0; i < nseg; i++) {
1603                         struct page *pg = idx_to_page(mmap_idx, pending_idx, i);
1604                         unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
1605                                                           usr_idx, i);
1606                         if (vma && uvaddr >= vma->vm_end) {
1607                                 vma = vma->vm_next;
1608                                 if (vma &&
1609                                     (uvaddr < vma->vm_start ||
1610                                      uvaddr >= vma->vm_end))
1611                                         vma = NULL;
1612                         }
1613                         if (vma == NULL) {
1614                                 vma = find_vma(mm, uvaddr);
1615                         /* The virtual area was already munmapped,
1616                            so skip to the next page. */
1617                                 if (!vma)
1618                                         continue;
1619                         }
1620                         ret = vm_insert_page(vma, uvaddr, pg);
1621                         if (ret)
1622                                 goto fail_flush;
1623                 }
1624         }
1625
1626         up_read(&mm->mmap_sem);
1627
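             /*
              * Hold a reference on the blkif for the lifetime of this
              * request; it is dropped once the user-space response has been
              * harvested.
              */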
1628         blkif_get(blkif);
1629         /* Finally, write the request message to the user ring. */
1630         target = RING_GET_REQUEST(&info->ufe_ring,
1631                                   info->ufe_ring.req_prod_pvt);
1632         memcpy(target, req, sizeof(*req));
1633         target->id = usr_idx;
1634         wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
1635         info->ufe_ring.req_prod_pvt++;
1636
1637         switch (req->operation) {
1638         case BLKIF_OP_READ:
1639                 blkif->st_rd_sect += nr_sects;
1640                 break;
1641         case BLKIF_OP_WRITE:
1642         case BLKIF_OP_WRITE_BARRIER:
1643                 blkif->st_wr_sect += nr_sects;
1644                 break;
1645         }
1646
1647         return;
1648
1649  fail_flush:
1650         up_read(&mm->mmap_sem);
1651         WPRINTK("request failed, unmapping segments\n");
1652         fast_flush_area(pending_req, pending_idx, usr_idx, info);
1653  fail_response:
1654         make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1655         free_req(pending_req);
1656         msleep(1); /* back off a bit */
1657 }
1658
1659
1660
1661 /******************************************************************
1662  * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1663  */
1664
1665
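     /*
      * Queue a response on the frontend's block ring using the layout of the
      * negotiated protocol, push it, and send the frontend an event channel
      * notification if it requested one.
      */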
1666 static void make_response(blkif_t *blkif, u64 id,
1667                           unsigned short op, int st)
1668 {
1669         blkif_response_t  resp;
1670         unsigned long     flags;
1671         blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1672         int more_to_do = 0;
1673         int notify;
1674
1675         resp.id        = id;
1676         resp.operation = op;
1677         resp.status    = st;
1678
1679         spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1680         /* Place on the response ring for the relevant domain. */
1681         switch (blkif->blk_protocol) {
1682         case BLKIF_PROTOCOL_NATIVE:
1683                 memcpy(RING_GET_RESPONSE(&blk_rings->native,
1684                                          blk_rings->native.rsp_prod_pvt),
1685                        &resp, sizeof(resp));
1686                 break;
1687         case BLKIF_PROTOCOL_X86_32:
1688                 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
1689                                          blk_rings->x86_32.rsp_prod_pvt),
1690                        &resp, sizeof(resp));
1691                 break;
1692         case BLKIF_PROTOCOL_X86_64:
1693                 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
1694                                          blk_rings->x86_64.rsp_prod_pvt),
1695                        &resp, sizeof(resp));
1696                 break;
1697         default:
1698                 BUG();
1699         }
1700         blk_rings->common.rsp_prod_pvt++;
1701         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1702
1703         if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
1704                 /*
1705                  * Tail check for pending requests. Allows frontend to avoid
1706                  * notifications if requests are already in flight (lower
1707                  * overheads and promotes batching).
1708                  */
1709                 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1710         } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
1711                 more_to_do = 1;
1712         }
1713
1714         spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
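             /*
              * Wake the back-end worker if more requests are pending, and
              * notify the frontend if it asked for an event.
              */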
1715         if (more_to_do)
1716                 blkif_notify_work(blkif);
1717         if (notify)
1718                 notify_remote_via_irq(blkif->irq);
1719 }
1720
1721 static int __init blkif_init(void)
1722 {
1723         int i, ret;
1724
1725         if (!is_running_on_xen())
1726                 return -ENODEV;
1727
1728         INIT_LIST_HEAD(&pending_free);
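             /*
              * Pre-allocate two chunks of pending-request state; carry on as
              * long as at least the first allocation succeeds.
              */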
1729         for (i = 0; i < 2; i++) {
1730                 ret = req_increase();
1731                 if (ret)
1732                         break;
1733         }
1734         if (i == 0)
1735                 return ret;
1736
1737         tap_blkif_interface_init();
1738
1739         alloc_pending_reqs = 0;
1740
1741         tap_blkif_xenbus_init();
1742
1743         /* Dynamically allocate a major for this device */
1744         ret = __register_chrdev(0, 0, MAX_TAP_DEV, "blktap", &blktap_fops);
1745
1746         if (ret < 0) {
1747                 WPRINTK("Couldn't register /dev/xen/blktap\n");
1748                 return ret;
1749         }
1750
1751         blktap_major = ret;
1752
1753         /* Minor 0 is the blktap0 control device; tapfds[0] is always NULL. */
1754         blktap_next_minor++;
1755
1756         DPRINTK("Created char device %d:0 [/dev/xen/blktap0]\n", ret);
1757
1758         /* Make sure the xen class exists */
1759         if (get_xen_class()) {
1760                 /*
1761                  * This lets udev create the blktap control device.
1762                  * We only create blktap0 up front; we don't want to
1763                  * flood sysfs with needless blktap devices.  Additional
1764                  * devices are created only when a new device is
1765                  * requested.
1766                  */
1767                 xen_class_device_create(&blktap_type, NULL,
1768                                         MKDEV(blktap_major, 0), NULL,
1769                                         "blktap0");
1770         } else {
1771                 /* this is bad, but not fatal */
1772                 WPRINTK("sysfs xen_class not created\n");
1773         }
1774
1775         DPRINTK("Blktap device successfully created\n");
1776
1777         return 0;
1778 }
1779
1780 module_init(blkif_init);
1781
1782 MODULE_LICENSE("Dual BSD/GPL");
1783 MODULE_ALIAS("devname:xen/blktap0");
1784 MODULE_ALIAS("xen-backend:tap");