1 /******************************************************************************
2  * drivers/xen/blktap/blktap.c
3  * 
4  * Back-end driver for user level virtual block devices. This portion of the
5  * driver exports a 'unified' block-device interface that can be accessed
6  * by any operating system that implements a compatible front end. Requests
7  * are remapped to a user-space memory region.
8  *
9  * Based on the blkback driver code.
10  * 
11  * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
12  *
13  * Clean ups and fix ups:
14  *    Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
15  *
16  * This program is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU General Public License version 2
18  * as published by the Free Software Foundation; or, when distributed
19  * separately from the Linux kernel or incorporated into other
20  * software packages, subject to the following license:
21  * 
22  * Permission is hereby granted, free of charge, to any person obtaining a copy
23  * of this source file (the "Software"), to deal in the Software without
24  * restriction, including without limitation the rights to use, copy, modify,
25  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
26  * and to permit persons to whom the Software is furnished to do so, subject to
27  * the following conditions:
28  * 
29  * The above copyright notice and this permission notice shall be included in
30  * all copies or substantial portions of the Software.
31  * 
32  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
37  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38  * IN THE SOFTWARE.
39  */
40
41 #include <linux/spinlock.h>
42 #include <linux/kthread.h>
43 #include <linux/freezer.h>
44 #include <linux/list.h>
45 #include <asm/hypervisor.h>
46 #include "common.h"
47 #include <xen/balloon.h>
48 #include <xen/driver_util.h>
49 #include <linux/kernel.h>
50 #include <linux/fs.h>
51 #include <linux/mm.h>
52 #include <linux/errno.h>
53 #include <linux/major.h>
54 #include <linux/gfp.h>
55 #include <linux/poll.h>
56 #include <linux/delay.h>
57 #include <asm/tlbflush.h>
58
59 #define MAX_TAP_DEV 256     /*the maximum number of tapdisk ring devices    */
60 #define MAX_DEV_NAME 100    /*the max tapdisk ring device name e.g. blktap0 */
61
62 /*
63  * The maximum number of requests that can be outstanding at any time
64  * is determined by 
65  *
66  *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
67  *
68  * where mmap_alloc < MAX_DYNAMIC_MEM.
69  *
70  * TODO:
71  * mmap_alloc is initialised to 2 and should be adjustable on the fly via
72  * sysfs.
73  */
74 #define BLK_RING_SIZE           __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
75 #define MAX_DYNAMIC_MEM         BLK_RING_SIZE
76 #define MAX_PENDING_REQS        BLK_RING_SIZE
77 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
78 #define MMAP_VADDR(_start, _req,_seg)                                   \
79         (_start +                                                       \
80          ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
81          ((_seg) * PAGE_SIZE))
82 static int blkif_reqs = MAX_PENDING_REQS;
83 static int mmap_pages = MMAP_PAGES;
84
85 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
86                       * have a bunch of pages reserved for shared
87                       * memory rings.
88                       */
89
90 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
91 typedef struct domid_translate {
92         unsigned short domid;
93         unsigned short busid;
94 } domid_translate_t ;
95
96 /*Data struct associated with each of the tapdisk devices*/
97 typedef struct tap_blkif {
98         struct vm_area_struct *vma;   /*Shared memory area                   */
99         unsigned long rings_vstart;   /*Kernel memory mapping                */
100         unsigned long user_vstart;    /*User memory mapping                  */
101         unsigned long dev_inuse;      /*One process opens device at a time.  */
102         unsigned long dev_pending;    /*In process of being opened           */
103         unsigned long ring_ok;        /*make this ring->state                */
104         blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
105         wait_queue_head_t wait;       /*for poll                             */
106         unsigned long mode;           /*current switching mode               */
107         int minor;                    /*Minor number for tapdisk device      */
108         pid_t pid;                    /*tapdisk process id                   */
109         enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
110                                                   shutdown                   */
111         unsigned long *idx_map;       /*Record the user ring id to kern 
112                                         [req id, idx] tuple                  */
113         blkif_t *blkif;               /*Associate blkif with tapdev          */
114         struct domid_translate trans; /*Translation from domid to bus.       */
115 } tap_blkif_t;
116
117 static struct tap_blkif *tapfds[MAX_TAP_DEV];
118 static int blktap_next_minor;
119
120 module_param(blkif_reqs, int, 0);
121 /* Run-time switchable: /sys/module/blktap/parameters/ */
122 static unsigned int log_stats = 0;
123 static unsigned int debug_lvl = 0;
124 module_param(log_stats, int, 0644);
125 module_param(debug_lvl, int, 0644);
126
127 /*
128  * Each outstanding request that we've passed to the lower device layers has a 
129  * 'pending_req' allocated to it. Each buffer_head that completes decrements 
130  * the pendcnt towards zero. When it hits zero, the specified domain has a 
131  * response queued for it, with the saved 'id' passed back.
132  */
133 typedef struct {
134         blkif_t       *blkif;
135         u64            id;
136         unsigned short mem_idx;
137         int            nr_pages;
138         atomic_t       pendcnt;
139         unsigned short operation;
140         int            status;
141         struct list_head free_list;
142         int            inuse;
143 } pending_req_t;
144
145 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
146 static struct list_head pending_free;
147 static DEFINE_SPINLOCK(pending_free_lock);
148 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
149 static int alloc_pending_reqs;
150
151 typedef unsigned int PEND_RING_IDX;
152
153 static inline int MASK_PEND_IDX(int i) { 
154         return (i & (MAX_PENDING_REQS-1));
155 }
156
157 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
158         return (req - pending_reqs[idx]);
159 }
160
161 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
162
163 #define BLKBACK_INVALID_HANDLE (~0)
164
165 static struct page **foreign_pages[MAX_DYNAMIC_MEM];
166 static inline unsigned long idx_to_kaddr(
167         unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
168 {
169         unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
170         unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
171         return (unsigned long)pfn_to_kaddr(pfn);
172 }
173
174 static unsigned short mmap_alloc = 0;
175 static unsigned short mmap_lock = 0;
176 static unsigned short mmap_inuse = 0;
177
178 /******************************************************************
179  * GRANT HANDLES
180  */
181
182 /* When using grant tables to map a frame for device access then the
183  * handle returned must be used to unmap the frame. This is needed to
184  * drop the ref count on the frame.
185  */
186 struct grant_handle_pair
187 {
188         grant_handle_t kernel;
189         grant_handle_t user;
190 };
191 #define INVALID_GRANT_HANDLE    0xFFFF
192
193 static struct grant_handle_pair 
194     pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
195 #define pending_handle(_id, _idx, _i) \
196     (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
197     + (_i)])
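
/*
 * Illustrative sketch (not part of the driver): tearing down one segment
 * from its recorded handle pair, mirroring what fast_flush_area() and
 * blktap_clear_pte() do further down:
 */
#if 0
static void example_unmap_segment(unsigned short mem_idx, int pend_idx,
				  int seg, unsigned long kvaddr, uint64_t ptep)
{
	struct gnttab_unmap_grant_ref op[2];
	struct grant_handle_pair *h = &pending_handle(mem_idx, pend_idx, seg);
	int n = 0;

	if (h->kernel != INVALID_GRANT_HANDLE)
		gnttab_set_unmap_op(&op[n++], kvaddr,
				    GNTMAP_host_map, h->kernel);
	if (h->user != INVALID_GRANT_HANDLE)
		gnttab_set_unmap_op(&op[n++], ptep,
				    GNTMAP_host_map | GNTMAP_application_map |
				    GNTMAP_contains_pte, h->user);

	if (n && HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, n))
		BUG();
}
#endif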
198
199
200 static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
201
202 #define BLKTAP_MINOR 0  /*/dev/xen/blktap has a dynamic major */
203 #define BLKTAP_DEV_DIR  "/dev/xen"
204
205 static int blktap_major;
206
207 /* blktap IOCTLs: */
208 #define BLKTAP_IOCTL_KICK_FE         1
209 #define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
210 #define BLKTAP_IOCTL_SETMODE         3
211 #define BLKTAP_IOCTL_SENDPID         4
212 #define BLKTAP_IOCTL_NEWINTF         5
213 #define BLKTAP_IOCTL_MINOR           6
214 #define BLKTAP_IOCTL_MAJOR           7
215 #define BLKTAP_QUERY_ALLOC_REQS      8
216 #define BLKTAP_IOCTL_FREEINTF        9
217 #define BLKTAP_IOCTL_PRINT_IDXS      100  
218
219 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
220 #define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
221 #define BLKTAP_MODE_INTERCEPT_FE     0x00000001
222 #define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
223
224 #define BLKTAP_MODE_INTERPOSE \
225            (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
226
227
228 static inline int BLKTAP_MODE_VALID(unsigned long arg)
229 {
230         return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
231                 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
232                 (arg == BLKTAP_MODE_INTERPOSE   ));
233 }
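
/*
 * Illustrative userspace sketch (an assumption based on the ioctls above,
 * not code from the real tapdisk; little-endian layout of domid_translate_t
 * assumed, includes and error handling omitted):
 */
#if 0
int example_attach(int ctrl_fd, unsigned short domid, unsigned short busid)
{
	char path[32];
	int minor, fd;
	void *shmem;

	/* Create a tap interface bound to (domid, busid); returns the minor. */
	minor = ioctl(ctrl_fd, BLKTAP_IOCTL_NEWINTF,
		      (unsigned long)domid | ((unsigned long)busid << 16));

	snprintf(path, sizeof(path), "/dev/xen/blktap%d", minor);
	fd = open(path, O_RDWR);

	/* Register our pid and switch the tap into front-end intercept mode. */
	ioctl(fd, BLKTAP_IOCTL_SENDPID, (unsigned long)getpid());
	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);

	/* Map the shared ring page plus the data area (see blktap_mmap()). */
	shmem = mmap(NULL, (RING_PAGES + MMAP_PAGES) * 4096,
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* ... poll(fd), service requests from the mapped ring, then issue
	 * BLKTAP_IOCTL_KICK_FE so blktap_read_ufe_ring() collects responses. */
	return fd;
}
#endif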
234
235 /* Requests passing through the tap to userspace are re-assigned an ID.
236  * We must record a mapping between the BE [IDX,ID] tuple and the userspace
237  * ring ID. 
238  */
239
240 static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
241 {
242         return ((fe_dom << 16) | MASK_PEND_IDX(idx));
243 }
244
245 static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
246 {
247         return (PEND_RING_IDX)(id & 0x0000ffff);
248 }
249
250 static inline int ID_TO_MIDX(unsigned long id)
251 {
252         return (int)(id >> 16);
253 }
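
/*
 * Worked example (illustrative): MAKE_ID(1, 5) == 0x00010005, from which
 * ID_TO_MIDX() recovers 1 and ID_TO_IDX() recovers 5.  Note that
 * dispatch_rw_block_io() reuses MAKE_ID() with a mem_idx rather than a
 * domid in the upper 16 bits when filling info->idx_map[].
 */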
254
255 #define INVALID_REQ 0xdead0000
256
257 /*TODO: Convert to a free list*/
258 static inline int GET_NEXT_REQ(unsigned long *idx_map)
259 {
260         int i;
261         for (i = 0; i < MAX_PENDING_REQS; i++)
262                 if (idx_map[i] == INVALID_REQ)
263                         return i;
264
265         return INVALID_REQ;
266 }
267
268 static inline int OFFSET_TO_USR_IDX(int offset)
269 {
270         return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
271 }
272
273 static inline int OFFSET_TO_SEG(int offset)
274 {
275         return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
276 }
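
/*
 * Worked example (illustrative, assuming BLKIF_MAX_SEGMENTS_PER_REQUEST
 * == 11): a fault at user_vstart + 25 pages gives offset 25, which decodes
 * as user ring slot OFFSET_TO_USR_IDX(25) == 2 and segment
 * OFFSET_TO_SEG(25) == 3, i.e. the inverse of MMAP_VADDR().
 */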
277
278
279 #define BLKTAP_INVALID_HANDLE(_g) \
280     (((_g->kernel) == INVALID_GRANT_HANDLE) &&  \
281      ((_g->user) == INVALID_GRANT_HANDLE))
282
283 #define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
284     (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
285     } while(0)
286
287
288 /******************************************************************
289  * BLKTAP VM OPS
290  */
291
292 static struct page *blktap_nopage(struct vm_area_struct *vma,
293                                   unsigned long address,
294                                   int *type)
295 {
296         /*
297          * if the page has not been mapped in by the driver then return
298          * NOPAGE_SIGBUS to the domain.
299          */
300
301         return NOPAGE_SIGBUS;
302 }
303
304 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
305                               unsigned long uvaddr,
306                               pte_t *ptep, int is_fullmm)
307 {
308         pte_t copy;
309         tap_blkif_t *info;
310         int offset, seg, usr_idx, pending_idx, mmap_idx;
311         unsigned long uvstart = vma->vm_start + (RING_PAGES << PAGE_SHIFT);
312         unsigned long kvaddr;
313         struct page **map;
314         struct page *pg;
315         struct grant_handle_pair *khandle;
316         struct gnttab_unmap_grant_ref unmap[2];
317         int count = 0;
318
319         /*
320          * If the address is before the start of the grant mapped region or
321          * if vm_file is NULL (meaning mmap failed and we have nothing to do)
322          */
323         if (uvaddr < uvstart || vma->vm_file == NULL)
324                 return ptep_get_and_clear_full(vma->vm_mm, uvaddr, 
325                                                ptep, is_fullmm);
326
327         info = vma->vm_file->private_data;
328         map = vma->vm_private_data;
329
330         /* TODO Should these be changed to if statements? */
331         BUG_ON(!info);
332         BUG_ON(!info->idx_map);
333         BUG_ON(!map);
334
335         offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT);
336         usr_idx = OFFSET_TO_USR_IDX(offset);
337         seg = OFFSET_TO_SEG(offset);
338
339         pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
340         mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
341
342         kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
343         pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
344         ClearPageReserved(pg);
345         map[offset + RING_PAGES] = NULL;
346
347         khandle = &pending_handle(mmap_idx, pending_idx, seg);
348
349         if (khandle->kernel != INVALID_GRANT_HANDLE) {
350                 gnttab_set_unmap_op(&unmap[count], kvaddr, 
351                                     GNTMAP_host_map, khandle->kernel);
352                 count++;
353
354                 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, 
355                                     INVALID_P2M_ENTRY);
356         }
357
358         if (khandle->user != INVALID_GRANT_HANDLE) {
359                 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
360
361                 copy = *ptep;
362                 gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), 
363                                     GNTMAP_host_map 
364                                     | GNTMAP_application_map 
365                                     | GNTMAP_contains_pte,
366                                     khandle->user);
367                 count++;
368         } else {
369                 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
370
371                 /* USING SHADOW PAGE TABLES. */
372                 copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
373                                                is_fullmm);
374         }
375
376         if (count) {
377                 BLKTAP_INVALIDATE_HANDLE(khandle);
378                 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
379                                               unmap, count))
380                         BUG();
381         }
382
383         return copy;
384 }
385
386 struct vm_operations_struct blktap_vm_ops = {
387         nopage:   blktap_nopage,
388         zap_pte:  blktap_clear_pte,
389 };
390
391 /******************************************************************
392  * BLKTAP FILE OPS
393  */
394  
395 /*Function Declarations*/
396 static tap_blkif_t *get_next_free_dev(void);
397 static int blktap_open(struct inode *inode, struct file *filp);
398 static int blktap_release(struct inode *inode, struct file *filp);
399 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
400 static int blktap_ioctl(struct inode *inode, struct file *filp,
401                         unsigned int cmd, unsigned long arg);
402 static unsigned int blktap_poll(struct file *file, poll_table *wait);
403
404 static const struct file_operations blktap_fops = {
405         .owner   = THIS_MODULE,
406         .poll    = blktap_poll,
407         .ioctl   = blktap_ioctl,
408         .open    = blktap_open,
409         .release = blktap_release,
410         .mmap    = blktap_mmap,
411 };
412
413
414 static tap_blkif_t *get_next_free_dev(void)
415 {
416         struct class *class;
417         tap_blkif_t *info;
418         int minor;
419
420         /*
421          * This is called only from the ioctl, which
422          * means we should always have interrupts enabled.
423          */
424         BUG_ON(irqs_disabled());
425
426         spin_lock_irq(&pending_free_lock);
427
428         /* tapfds[0] is always NULL */
429
430         for (minor = 1; minor < blktap_next_minor; minor++) {
431                 info = tapfds[minor];
432                 /* we could have failed a previous attempt. */
433                 if (!info || ((info->dev_inuse == 0) &&
434                               (info->dev_pending == 0))) {
435                         if (info)
436                                 info->dev_pending = 1;
437                         goto found;
438                 }
439         }
440         info = NULL;
441         minor = -1;
442
443         /*
444          * We didn't find a free device. If we can still allocate
445          * more, then we grab the next device minor that is
446          * available.  This is done while we are still under
447          * the protection of the pending_free_lock.
448          */
449         if (blktap_next_minor < MAX_TAP_DEV)
450                 minor = blktap_next_minor++;
451 found:
452         spin_unlock_irq(&pending_free_lock);
453
454         if (!info && minor > 0) {
455                 info = kzalloc(sizeof(*info), GFP_KERNEL);
456                 if (unlikely(!info)) {
457                         /*
458                          * If we failed here, try to put back
459                          * the next minor number. But if one
460                          * was just taken, then we just lose this
461                          * minor.  We can try to allocate this
462                          * minor again later.
463                          */
464                         spin_lock_irq(&pending_free_lock);
465                         if (blktap_next_minor == minor+1)
466                                 blktap_next_minor--;
467                         spin_unlock_irq(&pending_free_lock);
468                         goto out;
469                 }
470
471                 info->minor = minor;
472                 /*
473                  * Make sure that we have a minor before others can
474                  * see us.
475                  */
476                 wmb();
477                 tapfds[minor] = info;
478
479                 if ((class = get_xen_class()) != NULL)
480                         class_device_create(class, NULL,
481                                             MKDEV(blktap_major, minor), NULL,
482                                             "blktap%d", minor);
483         }
484
485 out:
486         return info;
487 }
488
489 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
490 {
491         tap_blkif_t *info;
492         int i;
493
494         for (i = 1; i < blktap_next_minor; i++) {
495                 info = tapfds[i];
496                 if ( info &&
497                      (info->trans.domid == domid) &&
498                      (info->trans.busid == xenbus_id) ) {
499                         info->blkif = blkif;
500                         info->status = RUNNING;
501                         return i;
502                 }
503         }
504         return -1;
505 }
506
507 void signal_tapdisk(int idx) 
508 {
509         tap_blkif_t *info;
510         struct task_struct *ptask;
511
512         info = (idx >= 0 && idx < MAX_TAP_DEV) ? tapfds[idx] : NULL;
513         if (!info)
514                 return;
515
516         if (info->pid > 0) {
517                 ptask = find_task_by_pid(info->pid);
518                 if (ptask)
519                         info->status = CLEANSHUTDOWN;
520         }
521         info->blkif = NULL;
522
523         return;
524 }
525
526 static int blktap_open(struct inode *inode, struct file *filp)
527 {
528         blkif_sring_t *sring;
529         int idx = iminor(inode) - BLKTAP_MINOR;
530         tap_blkif_t *info;
531         int i;
532         
533         /* ctrl device, treat differently */
534         if (!idx)
535                 return 0;
536
537         info = (idx > 0 && idx < MAX_TAP_DEV) ? tapfds[idx] : NULL;
538
539         if (!info) {
540                 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
541                         idx);
542                 return -ENODEV;
543         }
544
545         DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
546         
547         /*Only one process can access device at a time*/
548         if (test_and_set_bit(0, &info->dev_inuse))
549                 return -EBUSY;
550
551         info->dev_pending = 0;
552             
553         /* Allocate the fe ring. */
554         sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
555         if (sring == NULL)
556                 goto fail_nomem;
557
558         SetPageReserved(virt_to_page(sring));
559     
560         SHARED_RING_INIT(sring);
561         FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
562         
563         filp->private_data = info;
564         info->vma = NULL;
565
566         info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, 
567                                 GFP_KERNEL);
568         
569         if (info->idx_map == NULL)
570                 goto fail_nomem;
571
572         if (idx > 0) {
573                 init_waitqueue_head(&info->wait);
574                 for (i = 0; i < MAX_PENDING_REQS; i++) 
575                         info->idx_map[i] = INVALID_REQ;
576         }
577
578         DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
579         return 0;
580
581  fail_nomem:
582         /* Undo the ring allocation and the in-use claim on failure. */
583         if (sring) {
584                 ClearPageReserved(virt_to_page(sring));
585                 free_page((unsigned long)sring);
586         }
587         clear_bit(0, &info->dev_inuse);
588         return -ENOMEM;
583 }
584
585 static int blktap_release(struct inode *inode, struct file *filp)
586 {
587         tap_blkif_t *info = filp->private_data;
588         
589         /* check for control device */
590         if (!info)
591                 return 0;
592
593         info->dev_inuse = 0;
594         DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
595
596         /* Free the ring page. */
597         ClearPageReserved(virt_to_page(info->ufe_ring.sring));
598         free_page((unsigned long) info->ufe_ring.sring);
599
600         /* Clear any active mappings and free foreign map table */
601         if (info->vma) {
602                 zap_page_range(
603                         info->vma, info->vma->vm_start, 
604                         info->vma->vm_end - info->vma->vm_start, NULL);
605
606                 kfree(info->vma->vm_private_data);
607
608                 info->vma = NULL;
609         }
610
611         if (info->idx_map) {
612                 kfree(info->idx_map);
613                 info->idx_map = NULL;
614         }
615
616         if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
617                 if (info->blkif->xenblkd != NULL) {
618                         kthread_stop(info->blkif->xenblkd);
619                         info->blkif->xenblkd = NULL;
620                 }
621                 info->status = CLEANSHUTDOWN;
622         }
623
624         return 0;
625 }
626
627
628 /* Note on mmap:
629  * We need to map pages to user space in a way that will allow the block
630  * subsystem set up direct IO to them.  This couldn't be done before, because
631  * there isn't really a sane way to translate a user virtual address down to a 
632  * physical address when the page belongs to another domain.
633  *
634  * My first approach was to map the page in to kernel memory, add an entry
635  * for it in the physical frame list (using alloc_lomem_region as in blkback)
636  * and then attempt to map that page up to user space.  This is disallowed
637  * by xen though, which realizes that we don't really own the machine frame
638  * underlying the physical page.
639  *
640  * The new approach is to provide explicit support for this in xen linux.
641  * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
642  * mapped from other vms.  vma->vm_private_data is set up as a mapping 
643  * from pages to actual page structs.  There is a new clause in get_user_pages
644  * that does the right thing for this sort of mapping.
645  */
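
/*
 * Illustrative sketch (not the actual mm patch): the VM_FOREIGN-aware
 * clause in get_user_pages() essentially resolves a user address straight
 * from the table installed below, instead of walking the page tables:
 */
#if 0
static struct page *example_foreign_lookup(struct vm_area_struct *vma,
					   unsigned long addr)
{
	struct page **map = vma->vm_private_data;

	return map[(addr - vma->vm_start) >> PAGE_SHIFT];
}
#endif
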
646 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
647 {
648         int size;
649         struct page **map;
650         int i;
651         tap_blkif_t *info = filp->private_data;
652         int ret;
653
654         if (info == NULL) {
655                 WPRINTK("blktap: mmap, retrieving idx failed\n");
656                 return -ENOMEM;
657         }
658         
659         vma->vm_flags |= VM_RESERVED;
660         vma->vm_ops = &blktap_vm_ops;
661
662         size = vma->vm_end - vma->vm_start;
663         if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
664                 WPRINTK("you _must_ map exactly %d pages!\n",
665                        mmap_pages + RING_PAGES);
666                 return -EAGAIN;
667         }
668
669         size >>= PAGE_SHIFT;
670         info->rings_vstart = vma->vm_start;
671         info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
672     
673         /* Map the ring pages to the start of the region and reserve it. */
674         if (xen_feature(XENFEAT_auto_translated_physmap))
675                 ret = vm_insert_page(vma, vma->vm_start,
676                                      virt_to_page(info->ufe_ring.sring));
677         else
678                 ret = remap_pfn_range(vma, vma->vm_start,
679                                       __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
680                                       PAGE_SIZE, vma->vm_page_prot);
681         if (ret) {
682                 WPRINTK("Mapping user ring failed!\n");
683                 goto fail;
684         }
685
686         /* Mark this VM as containing foreign pages, and set up mappings. */
687         map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
688                       * sizeof(struct page *),
689                       GFP_KERNEL);
690         if (map == NULL) {
691                 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
692                 goto fail;
693         }
694
695         for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
696                 map[i] = NULL;
697     
698         vma->vm_private_data = map;
699         vma->vm_flags |= VM_FOREIGN;
700         vma->vm_flags |= VM_DONTCOPY;
701
702 #ifdef CONFIG_X86
703         vma->vm_mm->context.has_foreign_mappings = 1;
704 #endif
705
706         info->vma = vma;
707         info->ring_ok = 1;
708         return 0;
709  fail:
710         /* Clear any active mappings. */
711         zap_page_range(vma, vma->vm_start, 
712                        vma->vm_end - vma->vm_start, NULL);
713
714         return -ENOMEM;
715 }
716
717
718 static int blktap_ioctl(struct inode *inode, struct file *filp,
719                         unsigned int cmd, unsigned long arg)
720 {
721         tap_blkif_t *info = filp->private_data;
722
723         switch(cmd) {
724         case BLKTAP_IOCTL_KICK_FE: 
725         {
726                 /* There are fe messages to process. */
727                 return blktap_read_ufe_ring(info);
728         }
729         case BLKTAP_IOCTL_SETMODE:
730         {
731                 if (info) {
732                         if (BLKTAP_MODE_VALID(arg)) {
733                                 info->mode = arg;
734                                 /* XXX: may need to flush rings here. */
735                                 DPRINTK("blktap: set mode to %lx\n", 
736                                        arg);
737                                 return 0;
738                         }
739                 }
740                 return 0;
741         }
742         case BLKTAP_IOCTL_PRINT_IDXS:
743         {
744                 if (info) {
745                         printk("User Rings: \n-----------\n");
746                         printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
747                                 "| req_prod: %2d, rsp_prod: %2d\n",
748                                 info->ufe_ring.rsp_cons,
749                                 info->ufe_ring.req_prod_pvt,
750                                 info->ufe_ring.sring->req_prod,
751                                 info->ufe_ring.sring->rsp_prod);
752                 }
753                 return 0;
754         }
755         case BLKTAP_IOCTL_SENDPID:
756         {
757                 if (info) {
758                         info->pid = (pid_t)arg;
759                         DPRINTK("blktap: pid received %d\n", 
760                                info->pid);
761                 }
762                 return 0;
763         }
764         case BLKTAP_IOCTL_NEWINTF:
765         {               
766                 uint64_t val = (uint64_t)arg;
767                 domid_translate_t *tr = (domid_translate_t *)&val;
768
769                 DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
770                        tr->domid, tr->busid);
771                 info = get_next_free_dev();
772                 if (!info) {
773                         WPRINTK("Error initialising /dev/xen/blktap - "
774                                 "No more devices\n");
775                         return -1;
776                 }
777                 info->trans.domid = tr->domid;
778                 info->trans.busid = tr->busid;
779                 return info->minor;
780         }
781         case BLKTAP_IOCTL_FREEINTF:
782         {
783                 unsigned long dev = arg;
784                 unsigned long flags;
785
786                 info = (dev < MAX_TAP_DEV) ? tapfds[dev] : NULL;
787
788                 if (!info)
789                         return 0; /* should this be an error? */
790
791                 spin_lock_irqsave(&pending_free_lock, flags);
792                 if (info->dev_pending)
793                         info->dev_pending = 0;
794                 spin_unlock_irqrestore(&pending_free_lock, flags);
795
796                 return 0;
797         }
798         case BLKTAP_IOCTL_MINOR:
799         {
800                 unsigned long dev = arg;
801
802                 info = (dev < MAX_TAP_DEV) ? tapfds[dev] : NULL;
803
804                 if (!info)
805                         return -EINVAL;
806
807                 return info->minor;
808         }
809         case BLKTAP_IOCTL_MAJOR:
810                 return blktap_major;
811
812         case BLKTAP_QUERY_ALLOC_REQS:
813         {
814                 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
815                        alloc_pending_reqs, blkif_reqs);
816                 return (alloc_pending_reqs * 100) / blkif_reqs;
817         }
818         }
819         return -ENOIOCTLCMD;
820 }
821
822 static unsigned int blktap_poll(struct file *filp, poll_table *wait)
823 {
824         tap_blkif_t *info = filp->private_data;
825         
826         /* do not work on the control device */
827         if (!info)
828                 return 0;
829
830         poll_wait(filp, &info->wait, wait);
831         if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
832                 RING_PUSH_REQUESTS(&info->ufe_ring);
833                 return POLLIN | POLLRDNORM;
834         }
835         return 0;
836 }
837
838 void blktap_kick_user(int idx)
839 {
840         tap_blkif_t *info;
841
842         info = (idx >= 0 && idx < MAX_TAP_DEV) ? tapfds[idx] : NULL;
843
844         if (!info)
845                 return;
846
847         wake_up_interruptible(&info->wait);
848
849         return;
850 }
851
852 static int do_block_io_op(blkif_t *blkif);
853 static void dispatch_rw_block_io(blkif_t *blkif,
854                                  blkif_request_t *req,
855                                  pending_req_t *pending_req);
856 static void make_response(blkif_t *blkif, u64 id,
857                           unsigned short op, int st);
858
859 /******************************************************************
860  * misc small helpers
861  */
862 static int req_increase(void)
863 {
864         int i, j;
865
866         if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
867                 return -EINVAL;
868
869         pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
870                                             * blkif_reqs, GFP_KERNEL);
871         foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
872
873         if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
874                 goto out_of_memory;
875
876         DPRINTK("%s: reqs=%d, pages=%d\n",
877                 __FUNCTION__, blkif_reqs, mmap_pages);
878
879         for (i = 0; i < MAX_PENDING_REQS; i++) {
880                 list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
881                               &pending_free);
882                 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
883                 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
884                         BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
885                                                                  i, j));
886         }
887
888         mmap_alloc++;
889         DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
890         return 0;
891
892  out_of_memory:
893         free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
894         kfree(pending_reqs[mmap_alloc]);
895         WPRINTK("%s: out of memory\n", __FUNCTION__);
896         return -ENOMEM;
897 }
898
899 static void mmap_req_del(int mmap)
900 {
901         BUG_ON(!spin_is_locked(&pending_free_lock));
902
903         kfree(pending_reqs[mmap]);
904         pending_reqs[mmap] = NULL;
905
906         free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
907         foreign_pages[mmap] = NULL;
908
909         mmap_lock = 0;
910         DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
911         mmap_alloc--;
912 }
913
914 static pending_req_t* alloc_req(void)
915 {
916         pending_req_t *req = NULL;
917         unsigned long flags;
918
919         spin_lock_irqsave(&pending_free_lock, flags);
920
921         if (!list_empty(&pending_free)) {
922                 req = list_entry(pending_free.next, pending_req_t, free_list);
923                 list_del(&req->free_list);
924         }
925
926         if (req) {
927                 req->inuse = 1;
928                 alloc_pending_reqs++;
929         }
930         spin_unlock_irqrestore(&pending_free_lock, flags);
931
932         return req;
933 }
934
935 static void free_req(pending_req_t *req)
936 {
937         unsigned long flags;
938         int was_empty;
939
940         spin_lock_irqsave(&pending_free_lock, flags);
941
942         alloc_pending_reqs--;
943         req->inuse = 0;
944         if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
945                 mmap_inuse--;
946                 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
947                 spin_unlock_irqrestore(&pending_free_lock, flags);
948                 return;
949         }
950         was_empty = list_empty(&pending_free);
951         list_add(&req->free_list, &pending_free);
952
953         spin_unlock_irqrestore(&pending_free_lock, flags);
954
955         if (was_empty)
956                 wake_up(&pending_free_wq);
957 }
958
959 static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
960                             int tapidx)
961 {
962         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
963         unsigned int i, invcount = 0;
964         struct grant_handle_pair *khandle;
965         uint64_t ptep;
966         int ret, mmap_idx;
967         unsigned long kvaddr, uvaddr;
968         tap_blkif_t *info;
969         
970
971         info = (tapidx >= 0 && tapidx < MAX_TAP_DEV) ? tapfds[tapidx] : NULL;
972
973         if (!info) {
974                 WPRINTK("fast_flush: Couldn't get info!\n");
975                 return;
976         }
977
978         if (info->vma != NULL &&
979             xen_feature(XENFEAT_auto_translated_physmap)) {
980                 down_write(&info->vma->vm_mm->mmap_sem);
981                 zap_page_range(info->vma, 
982                                MMAP_VADDR(info->user_vstart, u_idx, 0), 
983                                req->nr_pages << PAGE_SHIFT, NULL);
984                 up_write(&info->vma->vm_mm->mmap_sem);
985                 return;
986         }
987
988         mmap_idx = req->mem_idx;
989
990         for (i = 0; i < req->nr_pages; i++) {
991                 kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
992                 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
993
994                 khandle = &pending_handle(mmap_idx, k_idx, i);
995
996                 if (khandle->kernel != INVALID_GRANT_HANDLE) {
997                         gnttab_set_unmap_op(&unmap[invcount],
998                                             idx_to_kaddr(mmap_idx, k_idx, i),
999                                             GNTMAP_host_map, khandle->kernel);
1000                         invcount++;
1001
1002                         set_phys_to_machine(
1003                                 __pa(idx_to_kaddr(mmap_idx, k_idx, i))
1004                                 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
1005                 }
1006
1007                 if (khandle->user != INVALID_GRANT_HANDLE) {
1008                         BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
1009                         if (create_lookup_pte_addr(
1010                                 info->vma->vm_mm,
1011                                 MMAP_VADDR(info->user_vstart, u_idx, i),
1012                                 &ptep) !=0) {
1013                                 WPRINTK("Couldn't get a pte addr!\n");
1014                                 return;
1015                         }
1016
1017                         gnttab_set_unmap_op(&unmap[invcount], ptep,
1018                                             GNTMAP_host_map
1019                                             | GNTMAP_application_map
1020                                             | GNTMAP_contains_pte,
1021                                             khandle->user);
1022                         invcount++;
1023                 }
1024
1025                 BLKTAP_INVALIDATE_HANDLE(khandle);
1026         }
1027         ret = HYPERVISOR_grant_table_op(
1028                 GNTTABOP_unmap_grant_ref, unmap, invcount);
1029         BUG_ON(ret);
1030         
1031         if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
1032                 zap_page_range(info->vma, 
1033                                MMAP_VADDR(info->user_vstart, u_idx, 0), 
1034                                req->nr_pages << PAGE_SHIFT, NULL);
1035 }
1036
1037 /******************************************************************
1038  * SCHEDULER FUNCTIONS
1039  */
1040
1041 static void print_stats(blkif_t *blkif)
1042 {
1043         printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
1044                current->comm, blkif->st_oo_req,
1045                blkif->st_rd_req, blkif->st_wr_req);
1046         blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
1047         blkif->st_rd_req = 0;
1048         blkif->st_wr_req = 0;
1049         blkif->st_oo_req = 0;
1050 }
1051
1052 int tap_blkif_schedule(void *arg)
1053 {
1054         blkif_t *blkif = arg;
1055
1056         blkif_get(blkif);
1057
1058         if (debug_lvl)
1059                 printk(KERN_DEBUG "%s: started\n", current->comm);
1060
1061         while (!kthread_should_stop()) {
1062                 if (try_to_freeze())
1063                         continue;
1064
1065                 wait_event_interruptible(
1066                         blkif->wq,
1067                         blkif->waiting_reqs || kthread_should_stop());
1068                 wait_event_interruptible(
1069                         pending_free_wq,
1070                         !list_empty(&pending_free) || kthread_should_stop());
1071
1072                 blkif->waiting_reqs = 0;
1073                 smp_mb(); /* clear flag *before* checking for work */
1074
1075                 if (do_block_io_op(blkif))
1076                         blkif->waiting_reqs = 1;
1077
1078                 if (log_stats && time_after(jiffies, blkif->st_print))
1079                         print_stats(blkif);
1080         }
1081
1082         if (log_stats)
1083                 print_stats(blkif);
1084         if (debug_lvl)
1085                 printk(KERN_DEBUG "%s: exiting\n", current->comm);
1086
1087         blkif->xenblkd = NULL;
1088         blkif_put(blkif);
1089
1090         return 0;
1091 }
1092
1093 /******************************************************************
1094  * COMPLETION CALLBACK -- Called by user level ioctl()
1095  */
1096
1097 static int blktap_read_ufe_ring(tap_blkif_t *info)
1098 {
1099         /* This is called to read responses from the UFE ring. */
1100         RING_IDX i, j, rp;
1101         blkif_response_t *resp;
1102         blkif_t *blkif=NULL;
1103         int pending_idx, usr_idx, mmap_idx;
1104         pending_req_t *pending_req;
1105         
1106         if (!info)
1107                 return 0;
1108
1109         /* We currently only forward packets in INTERCEPT_FE mode. */
1110         if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
1111                 return 0;
1112
1113         /* for each outstanding message on the UFEring  */
1114         rp = info->ufe_ring.sring->rsp_prod;
1115         rmb();
1116         
1117         for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
1118                 blkif_response_t res;
1119                 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
1120                 memcpy(&res, resp, sizeof(res));
1121                 mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
1122                 ++info->ufe_ring.rsp_cons;
1123
1124                 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
1125                 usr_idx = (int)res.id;
1126                 if ((usr_idx < 0) || (usr_idx >= MAX_PENDING_REQS)) {
1127                         WPRINTK("Incorrect user ring index %d\n", usr_idx);
1128                         continue;
1129                 }
1130                 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
1131                 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
1132
1133                 if ( (mmap_idx >= mmap_alloc) ||
1134                    (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) ) {
1135                         WPRINTK("Incorrect req map"
1136                                "[%d], internal map [%d,%d (%d)]\n",
1137                                usr_idx, mmap_idx,
1138                                ID_TO_IDX(info->idx_map[usr_idx]),
1139                                MASK_PEND_IDX(
1140                                        ID_TO_IDX(info->idx_map[usr_idx])));
1141                         continue;
1142                 }
1137
1138                 pending_req = &pending_reqs[mmap_idx][pending_idx];
1139                 blkif = pending_req->blkif;
1140
1141                 for (j = 0; j < pending_req->nr_pages; j++) {
1142
1143                         unsigned long kvaddr, uvaddr;
1144                         struct page **map = info->vma->vm_private_data;
1145                         struct page *pg;
1146                         int offset;
1147
1148                         uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
1149                         kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
1150
1151                         pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1152                         ClearPageReserved(pg);
1153                         offset = (uvaddr - info->vma->vm_start) 
1154                                 >> PAGE_SHIFT;
1155                         map[offset] = NULL;
1156                 }
1157                 fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
1158                 info->idx_map[usr_idx] = INVALID_REQ;
1159                 make_response(blkif, pending_req->id, res.operation,
1160                               res.status);
1161                 blkif_put(pending_req->blkif);
1162                 free_req(pending_req);
1163         }
1164                 
1165         return 0;
1166 }
1167
1168
1169 /******************************************************************************
1170  * NOTIFICATION FROM GUEST OS.
1171  */
1172
1173 static void blkif_notify_work(blkif_t *blkif)
1174 {
1175         blkif->waiting_reqs = 1;
1176         wake_up(&blkif->wq);
1177 }
1178
1179 irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
1180 {
1181         blkif_notify_work(dev_id);
1182         return IRQ_HANDLED;
1183 }
1184
1185
1186
1187 /******************************************************************
1188  * DOWNWARD CALLS -- These interface with the block-device layer proper.
1189  */
1190 static int print_dbug = 1;
1191 static int do_block_io_op(blkif_t *blkif)
1192 {
1193         blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1194         blkif_request_t req;
1195         pending_req_t *pending_req;
1196         RING_IDX rc, rp;
1197         int more_to_do = 0;
1198         tap_blkif_t *info;
1199
1200         rc = blk_rings->common.req_cons;
1201         rp = blk_rings->common.sring->req_prod;
1202         rmb(); /* Ensure we see queued requests up to 'rp'. */
1203
1204         /*Check blkif has corresponding UE ring*/
1205         if (blkif->dev_num < 0) {
1206                 /*oops*/
1207                 if (print_dbug) {
1208                         WPRINTK("Corresponding UE " 
1209                                "ring does not exist!\n");
1210                         print_dbug = 0; /*We only print this message once*/
1211                 }
1212                 return 0;
1213         }
1214
1215         info = (blkif->dev_num < MAX_TAP_DEV) ? tapfds[blkif->dev_num] : NULL;
1216
1217         if (!info || !info->dev_inuse) {
1218                 if (print_dbug) {
1219                         WPRINTK("Can't get UE info!\n");
1220                         print_dbug = 0;
1221                 }
1222                 return 0;
1223         }
1224
1225         while (rc != rp) {
1226                 
1227                 if (RING_FULL(&info->ufe_ring)) {
1228                         WPRINTK("RING_FULL! More to do\n");
1229                         more_to_do = 1;
1230                         break;
1231                 }
1232
1233                 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
1234                         WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1235                                " More to do\n");
1236                         more_to_do = 1;
1237                         break;          
1238                 }
1239
1240                 pending_req = alloc_req();
1241                 if (NULL == pending_req) {
1242                         blkif->st_oo_req++;
1243                         more_to_do = 1;
1244                         break;
1245                 }
1246
1247                 if (kthread_should_stop()) {
1248                         more_to_do = 1;
1249                         break;
1250                 }
1251
1252                 switch (blkif->blk_protocol) {
1253                 case BLKIF_PROTOCOL_NATIVE:
1254                         memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
1255                                sizeof(req));
1256                         break;
1257                 case BLKIF_PROTOCOL_X86_32:
1258                         blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1259                         break;
1260                 case BLKIF_PROTOCOL_X86_64:
1261                         blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1262                         break;
1263                 default:
1264                         BUG();
1265                 }
1266                 blk_rings->common.req_cons = ++rc; /* before make_response() */
1267
1268                 switch (req.operation) {
1269                 case BLKIF_OP_READ:
1270                         blkif->st_rd_req++;
1271                         dispatch_rw_block_io(blkif, &req, pending_req);
1272                         break;
1273
1274                 case BLKIF_OP_WRITE:
1275                         blkif->st_wr_req++;
1276                         dispatch_rw_block_io(blkif, &req, pending_req);
1277                         break;
1278
1279                 default:
1280                         /* A good sign something is wrong: sleep for a while to
1281                          * avoid excessive CPU consumption by a bad guest. */
1282                         msleep(1);
1283                         WPRINTK("unknown operation [%d]\n",
1284                                 req.operation);
1285                         make_response(blkif, req.id, req.operation,
1286                                       BLKIF_RSP_ERROR);
1287                         free_req(pending_req);
1288                         break;
1289                 }
1290
1291                 /* Yield point for this unbounded loop. */
1292                 cond_resched();
1293         }
1294                 
1295         blktap_kick_user(blkif->dev_num);
1296
1297         return more_to_do;
1298 }
1299
1300 static void dispatch_rw_block_io(blkif_t *blkif,
1301                                  blkif_request_t *req,
1302                                  pending_req_t *pending_req)
1303 {
1304         extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
1305         int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
1306         struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1307         unsigned int nseg;
1308         int ret, i, nr_sects = 0;
1309         tap_blkif_t *info;
1310         blkif_request_t *target;
1311         int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
1312         int usr_idx;
1313         uint16_t mmap_idx = pending_req->mem_idx;
1314
1315         if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV)
1316                 goto fail_response;
1317
1318         info = tapfds[blkif->dev_num];
1319         if (info == NULL)
1320                 goto fail_response;
1321
1322         /* Check we have space on user ring - should never fail. */
1323         usr_idx = GET_NEXT_REQ(info->idx_map);
1324         if (usr_idx == INVALID_REQ) {
1325                 BUG();
1326                 goto fail_response;
1327         }
1328
1329         /* Check that number of segments is sane. */
1330         nseg = req->nr_segments;
1331         if ( unlikely(nseg == 0) || 
1332             unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1333                 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1334                 goto fail_response;
1335         }
1336         
1337         /* Make sure userspace is ready. */
1338         if (!info->ring_ok) {
1339                 WPRINTK("blktap: ring not ready for requests!\n");
1340                 goto fail_response;
1341         }
1342
1343         if (RING_FULL(&info->ufe_ring)) {
1344                 WPRINTK("blktap: fe_ring is full, can't add -- "
1345                         "IO request will be dropped. %d %d\n",
1346                         RING_SIZE(&info->ufe_ring),
1347                         RING_SIZE(&blkif->blk_rings.common));
1348                 goto fail_response;
1349         }
1350
1351         pending_req->blkif     = blkif;
1352         pending_req->id        = req->id;
1353         pending_req->operation = operation;
1354         pending_req->status    = BLKIF_RSP_OKAY;
1355         pending_req->nr_pages  = nseg;
1356         op = 0;
1357         for (i = 0; i < nseg; i++) {
1358                 unsigned long uvaddr;
1359                 unsigned long kvaddr;
1360                 uint64_t ptep;
1361                 uint32_t flags;
1362
1363                 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1364                 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1365
1366                 flags = GNTMAP_host_map;
1367                 if (operation == WRITE)
1368                         flags |= GNTMAP_readonly;
1369                 gnttab_set_map_op(&map[op], kvaddr, flags,
1370                                   req->seg[i].gref, blkif->domid);
1371                 op++;
1372
1373                 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1374                         /* Now map it to user. */
1375                         ret = create_lookup_pte_addr(info->vma->vm_mm, 
1376                                                      uvaddr, &ptep);
1377                         if (ret) {
1378                                 WPRINTK("Couldn't get a pte addr!\n");
1379                                 goto fail_flush;
1380                         }
1381
1382                         flags = GNTMAP_host_map | GNTMAP_application_map
1383                                 | GNTMAP_contains_pte;
1384                         if (operation == WRITE)
1385                                 flags |= GNTMAP_readonly;
1386                         gnttab_set_map_op(&map[op], ptep, flags,
1387                                           req->seg[i].gref, blkif->domid);
1388                         op++;
1389                 }
1390
1391                 nr_sects += (req->seg[i].last_sect - 
1392                              req->seg[i].first_sect + 1);
1393         }
1394
1395         ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1396         BUG_ON(ret);
1397
1398         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1399                 for (i = 0; i < (nseg*2); i+=2) {
1400                         unsigned long uvaddr;
1401                         unsigned long kvaddr;
1402                         unsigned long offset;
1403                         struct page *pg;
1404
1405                         uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1406                         kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
1407
1408                         if (unlikely(map[i].status != 0)) {
1409                                 WPRINTK("invalid kernel buffer -- "
1410                                         "could not remap it\n");
1411                                 ret |= 1;
1412                                 map[i].handle = INVALID_GRANT_HANDLE;
1413                         }
1414
1415                         if (unlikely(map[i+1].status != 0)) {
1416                                 WPRINTK("invalid user buffer -- "
1417                                         "could not remap it\n");
1418                                 ret |= 1;
1419                                 map[i+1].handle = INVALID_GRANT_HANDLE;
1420                         }
1421
1422                         pending_handle(mmap_idx, pending_idx, i/2).kernel 
1423                                 = map[i].handle;
1424                         pending_handle(mmap_idx, pending_idx, i/2).user   
1425                                 = map[i+1].handle;
1426
1427                         if (ret)
1428                                 continue;
1429
1430                         set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
1431                                             FOREIGN_FRAME(map[i].dev_bus_addr
1432                                                           >> PAGE_SHIFT));
1433                         offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1434                         pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1435                         ((struct page **)info->vma->vm_private_data)[offset] =
1436                                 pg;
1437                 }
1438         } else {
1439                 for (i = 0; i < nseg; i++) {
1440                         unsigned long uvaddr;
1441                         unsigned long kvaddr;
1442                         unsigned long offset;
1443                         struct page *pg;
1444
1445                         uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1446                         kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1447
1448                         if (unlikely(map[i].status != 0)) {
1449                                 WPRINTK("invalid kernel buffer -- "
1450                                         "could not remap it\n");
1451                                 ret |= 1;
1452                                 map[i].handle = INVALID_GRANT_HANDLE;
1453                         }
1454
1455                         pending_handle(mmap_idx, pending_idx, i).kernel =
1456                                 map[i].handle;
1457
1458                         if (ret)
1459                                 continue;
1460
1461                         offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1462                         pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1463                         ((struct page **)info->vma->vm_private_data)[offset] =
1464                                 pg;
1465                 }
1466         }
1467
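        /*
         * At this point every segment is mapped into the kernel.  For
         * non-auto-translated guests the user-space PTEs were mapped by
         * the grant operation and the p2m entries point at the foreign
         * frames.  In both cases the pages were recorded in the vma's
         * private page array for later teardown; auto-translated guests
         * still need vm_insert_page() below to populate the user vma.
         */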
1468         if (ret)
1469                 goto fail_flush;
1470
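        /*
         * The pages are marked reserved below so the VM leaves these
         * grant-mapped frames alone.  In auto-translated mode each page is
         * also inserted into the user vma with vm_insert_page(), which is
         * done with mmap_sem held for writing.
         */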
1471         if (xen_feature(XENFEAT_auto_translated_physmap))
1472                 down_write(&info->vma->vm_mm->mmap_sem);
1473         /* Mark mapped pages as reserved: */
1474         for (i = 0; i < req->nr_segments; i++) {
1475                 unsigned long kvaddr;
1476                 struct page *pg;
1477
1478                 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1479                 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1480                 SetPageReserved(pg);
1481                 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1482                         ret = vm_insert_page(info->vma,
1483                                              MMAP_VADDR(info->user_vstart,
1484                                                         usr_idx, i), pg);
1485                         if (ret) {
1486                                 up_write(&info->vma->vm_mm->mmap_sem);
1487                                 goto fail_flush;
1488                         }
1489                 }
1490         }
1491         if (xen_feature(XENFEAT_auto_translated_physmap))
1492                 up_write(&info->vma->vm_mm->mmap_sem);
1493
1494         /* Record the [mmap_idx, pending_idx] to [usr_idx] mapping. */
1495         info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
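        /*
         * The user-ring completion path (blktap_read_ufe_ring()) uses this
         * table to translate the usr_idx carried in a user-ring response
         * back to the original pending request.
         */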
1496
1497         blkif_get(blkif);
1498         /* Finally, write the request message to the user ring. */
1499         target = RING_GET_REQUEST(&info->ufe_ring,
1500                                   info->ufe_ring.req_prod_pvt);
1501         memcpy(target, req, sizeof(*req));
1502         target->id = usr_idx;
1503         wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
1504         info->ufe_ring.req_prod_pvt++;
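        /*
         * Only the private producer index is advanced here; blktap_poll()
         * publishes it to the shared user ring when tapdisk polls, and the
         * wmb() above ensures the copied request is visible before the
         * index moves.
         */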
1505
1506         if (operation == READ)
1507                 blkif->st_rd_sect += nr_sects;
1508         else if (operation == WRITE)
1509                 blkif->st_wr_sect += nr_sects;
1510
1511         return;
1512
1513  fail_flush:
1514         WPRINTK("mapping failed, flushing area\n");
1515         fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
1516  fail_response:
1517         make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1518         free_req(pending_req);
1519         msleep(1); /* back off a bit */
1520 }
1521
1522
1523
1524 /******************************************************************
1525  * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1526  */
1527
1528
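/*
 * Queue a response on the shared frontend ring, handling the three
 * possible ring layouts (native, x86_32 and x86_64 guests), and kick the
 * frontend over its event channel when a notification is required.
 */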
1529 static void make_response(blkif_t *blkif, u64 id,
1530                           unsigned short op, int st)
1531 {
1532         blkif_response_t  resp;
1533         unsigned long     flags;
1534         blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1535         int more_to_do = 0;
1536         int notify;
1537
1538         resp.id        = id;
1539         resp.operation = op;
1540         resp.status    = st;
1541
1542         spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1543         /* Place on the response ring for the relevant domain. */
1544         switch (blkif->blk_protocol) {
1545         case BLKIF_PROTOCOL_NATIVE:
1546                 memcpy(RING_GET_RESPONSE(&blk_rings->native,
1547                                          blk_rings->native.rsp_prod_pvt),
1548                        &resp, sizeof(resp));
1549                 break;
1550         case BLKIF_PROTOCOL_X86_32:
1551                 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
1552                                          blk_rings->x86_32.rsp_prod_pvt),
1553                        &resp, sizeof(resp));
1554                 break;
1555         case BLKIF_PROTOCOL_X86_64:
1556                 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
1557                                          blk_rings->x86_64.rsp_prod_pvt),
1558                        &resp, sizeof(resp));
1559                 break;
1560         default:
1561                 BUG();
1562         }
1563         blk_rings->common.rsp_prod_pvt++;
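        /*
         * RING_PUSH_RESPONSES_AND_CHECK_NOTIFY() sets notify only when the
         * frontend asked to be woken for a response in the range just
         * pushed, which keeps event-channel traffic to a minimum.
         */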
1564         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1565
1566         if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
1567                 /*
1568                  * Tail check for pending requests. Allows frontend to avoid
1569                  * notifications if requests are already in flight (lower
1570                  * overheads and promotes batching).
1571                  */
1572                 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1573         } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
1574                 more_to_do = 1;
1575         }
1576
1577         spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1578         if (more_to_do)
1579                 blkif_notify_work(blkif);
1580         if (notify)
1581                 notify_remote_via_irq(blkif->irq);
1582 }
1583
1584 static int __init blkif_init(void)
1585 {
1586         int i, ret;
1587         struct class *class;
1588
1589         if (!is_running_on_xen())
1590                 return -ENODEV;
1591
1592         INIT_LIST_HEAD(&pending_free);
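        /*
         * Grow the pending-request pool twice, leaving mmap_alloc at 2 as
         * described in the comment at the top of the file.  A failure on
         * the second pass is tolerated; only a complete failure (i == 0)
         * aborts initialisation.
         */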
1593         for (i = 0; i < 2; i++) {
1594                 ret = req_increase();
1595                 if (ret)
1596                         break;
1597         }
1598         if (i == 0)
1599                 return ret;
1600
1601         tap_blkif_interface_init();
1602
1603         alloc_pending_reqs = 0;
1604
1605         tap_blkif_xenbus_init();
1606
1607         /* Dynamically allocate a major for this device */
1608         ret = register_chrdev(0, "blktap", &blktap_fops);
1609
1610         if (ret < 0) {
1611                 WPRINTK("Couldn't register /dev/xen/blktap\n");
1612                 return ret;
1613         }
1614
1615         blktap_major = ret;
1616
1617         /* tapfds[0] is always NULL */
1618         blktap_next_minor++;
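        /*
         * Minor 0 is reserved for the control node (blktap0) created
         * below; per-ring blktap devices are handed subsequent minors on
         * demand.
         */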
1619
1620         DPRINTK("Created blktap char device, major %d\n", blktap_major);
1621
1622         /* Make sure the xen class exists */
1623         if ((class = get_xen_class()) != NULL) {
1624                 /*
1625                  * This allows udev to create the blktap ctrl device.
1626                  * Only blktap0 is created up front; we don't want to
1627                  * flood sysfs with needless blktap devices.  Further
1628                  * devices are created only when a new device is
1629                  * requested.
1630                  */
1631                 class_device_create(class, NULL,
1632                                     MKDEV(blktap_major, 0), NULL,
1633                                     "blktap0");
1634         } else {
1635                 /* this is bad, but not fatal */
1636                 WPRINTK("blktap: sysfs xen_class not created\n");
1637         }
1638
1639         DPRINTK("Blktap device successfully created\n");
1640
1641         return 0;
1642 }
1643
1644 module_init(blkif_init);
1645
1646 MODULE_LICENSE("Dual BSD/GPL");