1 /******************************************************************************
2 * drivers/xen/blktap/blktap.c
4 * Back-end driver for user level virtual block devices. This portion of the
5 * driver exports a 'unified' block-device interface that can be accessed
6 * by any operating system that implements a compatible front end. Requests
7 * are remapped to a user-space memory region.
9 * Based on the blkback driver code.
11 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
13 * Clean ups and fix ups:
14 * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License version 2
18 * as published by the Free Software Foundation; or, when distributed
19 * separately from the Linux kernel or incorporated into other
20 * software packages, subject to the following license:
22 * Permission is hereby granted, free of charge, to any person obtaining a copy
23 * of this source file (the "Software"), to deal in the Software without
24 * restriction, including without limitation the rights to use, copy, modify,
25 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
26 * and to permit persons to whom the Software is furnished to do so, subject to
27 * the following conditions:
29 * The above copyright notice and this permission notice shall be included in
30 * all copies or substantial portions of the Software.
32 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
37 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41 #include <linux/spinlock.h>
42 #include <linux/kthread.h>
43 #include <linux/freezer.h>
44 #include <linux/list.h>
45 #include <linux/module.h>
46 #include <asm/hypervisor.h>
48 #include <xen/balloon.h>
49 #include <xen/driver_util.h>
50 #include <xen/evtchn.h>
51 #include <xen/gnttab.h>
52 #include <linux/kernel.h>
55 #include <linux/errno.h>
56 #include <linux/major.h>
57 #include <linux/gfp.h>
58 #include <linux/poll.h>
59 #include <linux/delay.h>
60 #include <linux/nsproxy.h>
61 #include <asm/tlbflush.h>
63 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
64 #define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */
67 * The maximum number of requests that can be outstanding at any time
70 * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
72 * where mmap_alloc < MAX_DYNAMIC_MEM.
75 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
78 #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
79 #define MAX_DYNAMIC_MEM BLK_RING_SIZE
80 #define MAX_PENDING_REQS BLK_RING_SIZE
81 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
82 #define MMAP_VADDR(_start, _req,_seg) \
84 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
86 static int mmap_pages = MMAP_PAGES;
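/*
 * Size sketch (illustrative numbers, assuming 4 KiB pages and the usual
 * 32-entry blkif ring): MMAP_PAGES = 32 * BLKIF_MAX_SEGMENTS_PER_REQUEST
 * = 352 data pages per memory arena, so the default mmap_alloc of 2 allows
 * up to 704 foreign pages to be mapped at once.  Within the user mapping
 * the RING_PAGES shared-ring page(s) come first and the data pages follow,
 * so
 *
 *   MMAP_VADDR(user_vstart, req, seg)
 *     == user_vstart + (req * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg) * PAGE_SIZE
 *
 * e.g. a fault 25 pages past user_vstart belongs to request slot 2,
 * segment 3 (25 == 2 * 11 + 3), which is what OFFSET_TO_USR_IDX() and
 * OFFSET_TO_SEG() compute further down.
 */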
88 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
89 * have a bunch of pages reserved for shared
93 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
94 typedef struct domid_translate {
99 typedef struct domid_translate_ext {
100 unsigned short domid;
102 } domid_translate_ext_t ;
104 /*Data struct associated with each of the tapdisk devices*/
105 typedef struct tap_blkif {
106 struct mm_struct *mm; /*User address space */
107 unsigned long rings_vstart; /*Kernel memory mapping */
108 unsigned long user_vstart; /*User memory mapping */
109 unsigned long dev_inuse; /*One process opens device at a time. */
110 unsigned long dev_pending; /*In process of being opened */
111 unsigned long ring_ok; /*make this ring->state */
112 blkif_front_ring_t ufe_ring; /*Rings up to user space. */
113 wait_queue_head_t wait; /*for poll */
114 unsigned long mode; /*current switching mode */
115 int minor; /*Minor number for tapdisk device */
116 pid_t pid; /*tapdisk process id */
117 struct pid_namespace *pid_ns; /*... and its corresponding namespace */
118 enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
120 spinlock_t map_lock; /*protects idx_map */
123 } *idx_map; /*Record the user ring id to kern
124 [req id, idx] tuple */
125 blkif_t *blkif; /*Associate blkif with tapdev */
126 struct domid_translate_ext trans; /*Translation from domid to bus. */
127 struct vm_foreign_map foreign_map; /*Mapping page */
130 static struct tap_blkif *tapfds[MAX_TAP_DEV];
131 static int blktap_next_minor;
133 /* Run-time switchable: /sys/module/blktap/parameters/ */
134 static unsigned int log_stats = 0;
135 static unsigned int debug_lvl = 0;
module_param(log_stats, uint, 0644);
module_param(debug_lvl, uint, 0644);
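/*
 * Usage sketch (assumes the module is loaded under the name "blktap"): with
 * the 0644 permissions above, both knobs can be flipped at run time, e.g.
 *   echo 1 > /sys/module/blktap/parameters/log_stats
 * enables the periodic per-thread statistics printed by print_stats();
 * debug_lvl is exposed the same way.
 */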
140 * Each outstanding request that we've passed to the lower device layers has a
141 * 'pending_req' allocated to it.
146 unsigned short mem_idx;
147 unsigned short nr_pages;
148 struct list_head free_list;
151 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
152 static struct list_head pending_free;
153 static DEFINE_SPINLOCK(pending_free_lock);
154 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
155 static int alloc_pending_reqs;
157 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
158 return (req - pending_reqs[idx]);
161 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
163 #define BLKBACK_INVALID_HANDLE (~0)
165 static struct page **foreign_pages[MAX_DYNAMIC_MEM];
166 static inline struct page *idx_to_page(
167 unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
169 unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
170 return foreign_pages[mmap_idx][arr_idx];
172 static inline unsigned long idx_to_kaddr(
173 unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
175 unsigned long pfn = page_to_pfn(idx_to_page(mmap_idx,req_idx,sg_idx));
176 return (unsigned long)pfn_to_kaddr(pfn);
179 static unsigned short mmap_alloc = 0;
180 static unsigned short mmap_lock = 0;
181 static unsigned short mmap_inuse = 0;
183 /******************************************************************
187 /* When using grant tables to map a frame for device access then the
188 * handle returned must be used to unmap the frame. This is needed to
189 * drop the ref count on the frame.
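 *
 * Each mapped segment therefore keeps a pair of handles: one for the
 * kernel mapping and one for the PTE mapped into the tapdisk process
 * (the latter only when this domain is not auto-translated).  The
 * pending_handle() macro below selects the pair for a given
 * (memory arena, request slot, segment) triple.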
191 struct grant_handle_pair
193 grant_handle_t kernel;
196 #define INVALID_GRANT_HANDLE 0xFFFF
198 static struct grant_handle_pair
199 pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
200 #define pending_handle(_id, _idx, _i) \
201 (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
205 static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
#define BLKTAP_MINOR 0 /* /dev/xen/blktap has a dynamic major */
208 #define BLKTAP_DEV_DIR "/dev/xen"
210 static int blktap_major;
213 #define BLKTAP_IOCTL_KICK_FE 1
214 #define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
215 #define BLKTAP_IOCTL_SETMODE 3
216 #define BLKTAP_IOCTL_SENDPID 4
217 #define BLKTAP_IOCTL_NEWINTF 5
218 #define BLKTAP_IOCTL_MINOR 6
219 #define BLKTAP_IOCTL_MAJOR 7
220 #define BLKTAP_QUERY_ALLOC_REQS 8
221 #define BLKTAP_IOCTL_FREEINTF 9
222 #define BLKTAP_IOCTL_NEWINTF_EXT 50
223 #define BLKTAP_IOCTL_PRINT_IDXS 100
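/*
 * Illustrative userspace sequence (a sketch only -- the real protocol lives
 * in the tapdisk/blktapctrl tools; the device names and the packed NEWINTF
 * argument below are assumptions for illustration, not part of this file):
 *
 *   int ctrl = open("/dev/xen/blktap0", O_RDWR);          // control node
 *   int minor = ioctl(ctrl, BLKTAP_IOCTL_NEWINTF, packed_domid_busid);
 *   int fd = open("/dev/xen/blktapN", O_RDWR);            // N == minor
 *   ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
 *   ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
 *   ring = mmap(NULL, (RING_PAGES + MMAP_PAGES) * page_size,
 *               PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *   ... poll(fd), consume requests, perform the I/O, queue responses ...
 *   ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);                    // flush responses
 */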
225 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
226 #define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
227 #define BLKTAP_MODE_INTERCEPT_FE 0x00000001
228 #define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
230 #define BLKTAP_MODE_INTERPOSE \
231 (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
234 static inline int BLKTAP_MODE_VALID(unsigned long arg)
236 return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
237 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
238 (arg == BLKTAP_MODE_INTERPOSE ));
241 /* Requests passing through the tap to userspace are re-assigned an ID.
242 * We must record a mapping between the BE [IDX,ID] tuple and the userspace
246 #define INVALID_MIDX 0xdead
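/*
 * Round trip of an ID, as implemented below: dispatch_rw_block_io() stores
 * [mem_idx, pending_idx] in idx_map[usr_idx] and forwards the request to
 * userspace with id = usr_idx; blktap_read_ufe_ring() then uses the id of
 * each response to look the original [mmap_idx, pending_idx] pair back up.
 */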
248 /*TODO: Convert to a free list*/
249 static inline unsigned int GET_NEXT_REQ(const struct idx_map *idx_map)
253 for (i = 0; i < MAX_PENDING_REQS; i++)
254 if (idx_map[i].mem == INVALID_MIDX)
260 static inline unsigned int OFFSET_TO_USR_IDX(unsigned long offset)
262 return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
265 static inline unsigned int OFFSET_TO_SEG(unsigned long offset)
267 return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
271 #define BLKTAP_INVALID_HANDLE(_g) \
(((_g)->kernel == INVALID_GRANT_HANDLE) && \
((_g)->user == INVALID_GRANT_HANDLE))
275 #define BLKTAP_INVALIDATE_HANDLE(_g) do { \
276 (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
280 static char *blktap_devnode(struct device *dev, umode_t *mode)
282 return kasprintf(GFP_KERNEL, "xen/blktap%u", MINOR(dev->devt));
285 static struct device_type blktap_type = {
286 .devnode = blktap_devnode
289 /******************************************************************
293 static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
296 * if the page has not been mapped in by the driver then return
297 * VM_FAULT_SIGBUS to the domain.
300 return VM_FAULT_SIGBUS;
303 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
304 unsigned long uvaddr,
305 pte_t *ptep, int is_fullmm)
308 tap_blkif_t *info = NULL;
309 unsigned int seg, usr_idx, pending_idx, mmap_idx, count = 0;
310 unsigned long offset;
312 struct grant_handle_pair *khandle;
313 struct gnttab_unmap_grant_ref unmap[2];
316 * If the address is before the start of the grant mapped region or
317 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
319 if (vma->vm_file != NULL)
320 info = vma->vm_file->private_data;
321 if (info == NULL || uvaddr < info->user_vstart)
322 return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
325 offset = (uvaddr - info->user_vstart) >> PAGE_SHIFT;
326 usr_idx = OFFSET_TO_USR_IDX(offset);
327 seg = OFFSET_TO_SEG(offset);
329 spin_lock(&info->map_lock);
331 pending_idx = info->idx_map[usr_idx].req;
332 mmap_idx = info->idx_map[usr_idx].mem;
334 /* fast_flush_area() may already have cleared this entry */
335 if (mmap_idx == INVALID_MIDX) {
336 spin_unlock(&info->map_lock);
337 return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
341 pg = idx_to_page(mmap_idx, pending_idx, seg);
342 ClearPageReserved(pg);
343 info->foreign_map.map[offset + RING_PAGES] = NULL;
345 khandle = &pending_handle(mmap_idx, pending_idx, seg);
347 if (khandle->kernel != INVALID_GRANT_HANDLE) {
348 unsigned long pfn = page_to_pfn(pg);
350 gnttab_set_unmap_op(&unmap[count],
351 (unsigned long)pfn_to_kaddr(pfn),
352 GNTMAP_host_map, khandle->kernel);
355 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
358 if (khandle->user != INVALID_GRANT_HANDLE) {
359 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
362 gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep),
364 | GNTMAP_application_map
365 | GNTMAP_contains_pte,
369 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
371 /* USING SHADOW PAGE TABLES. */
372 copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
377 BLKTAP_INVALIDATE_HANDLE(khandle);
378 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
383 spin_unlock(&info->map_lock);
388 static void blktap_vma_open(struct vm_area_struct *vma)
391 if (vma->vm_file == NULL)
394 info = vma->vm_file->private_data;
395 vma->vm_private_data =
396 &info->foreign_map.map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT];
* On a partial munmap, ->open() is called only for the split-off vma that
* is about to be released (see split_vma() and do_munmap() in mm/mmap.c),
* so there is no chance there to fix up vm_private_data of the vma that
* remains; do that here, in ->close() of the vanishing vma, instead.
404 static void blktap_vma_close(struct vm_area_struct *vma)
407 struct vm_area_struct *next = vma->vm_next;
410 vma->vm_ops != next->vm_ops ||
411 vma->vm_end != next->vm_start ||
412 vma->vm_file == NULL ||
413 vma->vm_file != next->vm_file)
416 info = vma->vm_file->private_data;
417 next->vm_private_data =
418 &info->foreign_map.map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];
421 static struct vm_operations_struct blktap_vm_ops = {
.zap_pte = blktap_clear_pte,
.open = blktap_vma_open,
.close = blktap_vma_close,
428 /******************************************************************
432 /*Function Declarations*/
433 static tap_blkif_t *get_next_free_dev(void);
434 static int blktap_open(struct inode *inode, struct file *filp);
435 static int blktap_release(struct inode *inode, struct file *filp);
436 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
437 static long blktap_ioctl(struct file *filp, unsigned int cmd,
439 static unsigned int blktap_poll(struct file *file, poll_table *wait);
441 static const struct file_operations blktap_fops = {
442 .owner = THIS_MODULE,
444 .unlocked_ioctl = blktap_ioctl,
446 .release = blktap_release,
452 static tap_blkif_t *get_next_free_dev(void)
458 * This is called only from the ioctl, which
459 * means we should always have interrupts enabled.
461 BUG_ON(irqs_disabled());
463 spin_lock_irq(&pending_free_lock);
465 /* tapfds[0] is always NULL */
467 for (minor = 1; minor < blktap_next_minor; minor++) {
468 info = tapfds[minor];
469 /* we could have failed a previous attempt. */
471 ((!test_bit(0, &info->dev_inuse)) &&
472 (info->dev_pending == 0)) ) {
473 info->dev_pending = 1;
481 * We didn't find free device. If we can still allocate
482 * more, then we grab the next device minor that is
483 * available. This is done while we are still under
484 * the protection of the pending_free_lock.
486 if (blktap_next_minor < MAX_TAP_DEV)
487 minor = blktap_next_minor++;
489 spin_unlock_irq(&pending_free_lock);
491 if (!info && minor > 0) {
492 info = kzalloc(sizeof(*info), GFP_KERNEL);
493 if (unlikely(!info)) {
495 * If we failed here, try to put back
496 * the next minor number. But if one
497 * was just taken, then we just lose this
498 * minor. We can try to allocate this
501 spin_lock_irq(&pending_free_lock);
502 if (blktap_next_minor == minor+1)
504 spin_unlock_irq(&pending_free_lock);
509 spin_lock_init(&info->map_lock);
511 * Make sure that we have a minor before others can
515 tapfds[minor] = info;
517 xen_class_device_create(&blktap_type, NULL,
518 MKDEV(blktap_major, minor),
519 NULL, "blktap%d", minor);
526 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
531 for (i = 1; i < blktap_next_minor; i++) {
534 (info->trans.domid == domid) &&
535 (info->trans.busid == xenbus_id) ) {
537 info->status = RUNNING;
544 void signal_tapdisk(int idx)
547 struct task_struct *ptask;
548 struct mm_struct *mm;
551 * if the userland tools set things up wrong, this could be negative;
552 * just don't try to signal in this case
554 if (idx < 0 || idx >= MAX_TAP_DEV)
562 ptask = pid_task(find_pid_ns(info->pid, info->pid_ns),
565 info->status = CLEANSHUTDOWN;
569 mm = xchg(&info->mm, NULL);
574 static int blktap_open(struct inode *inode, struct file *filp)
576 blkif_sring_t *sring;
577 int idx = iminor(inode) - BLKTAP_MINOR;
581 nonseekable_open(inode, filp);
583 /* ctrl device, treat differently */
586 if (idx < 0 || idx >= MAX_TAP_DEV) {
587 WPRINTK("No device /dev/xen/blktap%d\n", idx);
593 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
598 DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
600 /*Only one process can access device at a time*/
601 if (test_and_set_bit(0, &info->dev_inuse))
604 info->dev_pending = 0;
606 /* Allocate the fe ring. */
607 sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
611 SetPageReserved(virt_to_page(sring));
613 SHARED_RING_INIT(sring);
614 FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
616 filp->private_data = info;
619 info->idx_map = kmalloc(sizeof(*info->idx_map) * MAX_PENDING_REQS,
622 if (info->idx_map == NULL)
626 init_waitqueue_head(&info->wait);
627 for (i = 0; i < MAX_PENDING_REQS; i++) {
628 info->idx_map[i].mem = INVALID_MIDX;
629 info->idx_map[i].req = ~0;
633 DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
640 static int blktap_release(struct inode *inode, struct file *filp)
642 tap_blkif_t *info = filp->private_data;
643 struct mm_struct *mm;
645 /* check for control device */
651 info->rings_vstart = 0;
653 mm = xchg(&info->mm, NULL);
656 kfree(info->foreign_map.map);
657 info->foreign_map.map = NULL;
659 /* Free the ring page. */
660 ClearPageReserved(virt_to_page(info->ufe_ring.sring));
661 free_page((unsigned long) info->ufe_ring.sring);
664 kfree(info->idx_map);
665 info->idx_map = NULL;
668 if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
669 if (info->blkif->xenblkd != NULL) {
670 kthread_stop(info->blkif->xenblkd);
671 info->blkif->xenblkd = NULL;
673 info->status = CLEANSHUTDOWN;
676 clear_bit(0, &info->dev_inuse);
677 DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
* We need to map pages to user space in a way that will allow the block
* subsystem to set up direct I/O to them. This couldn't be done before, because
686 * there isn't really a sane way to translate a user virtual address down to a
687 * physical address when the page belongs to another domain.
689 * My first approach was to map the page in to kernel memory, add an entry
690 * for it in the physical frame list (using alloc_lomem_region as in blkback)
691 * and then attempt to map that page up to user space. This is disallowed
692 * by xen though, which realizes that we don't really own the machine frame
693 * underlying the physical page.
* The new approach is to provide explicit support for this in Xen Linux.
* The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
* mapped from other VMs. vma->vm_private_data is set up as a mapping
698 * from pages to actual page structs. There is a new clause in get_user_pages
699 * that does the right thing for this sort of mapping.
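 *
 * A minimal userspace sketch (illustrative only): tapdisk is expected to map
 * the whole region in a single call, ring page(s) first, data pages after:
 *
 *   void *p = mmap(NULL, (RING_PAGES + MMAP_PAGES) * page_size,
 *                  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Any other size is rejected below, and a second mmap() on the same file
 * is refused via the info->rings_vstart check.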
701 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
704 tap_blkif_t *info = filp->private_data;
708 WPRINTK("mmap: no private data?\n");
712 if (info->rings_vstart) {
713 WPRINTK("mmap already called on filp %p (minor %d)\n",
718 vma->vm_flags |= VM_RESERVED;
719 vma->vm_ops = &blktap_vm_ops;
721 size = vma->vm_end - vma->vm_start;
722 if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
723 WPRINTK("you _must_ map exactly %d pages!\n",
724 mmap_pages + RING_PAGES);
729 info->rings_vstart = vma->vm_start;
730 info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
732 /* Map the ring pages to the start of the region and reserve it. */
733 if (xen_feature(XENFEAT_auto_translated_physmap))
734 ret = vm_insert_page(vma, vma->vm_start,
735 virt_to_page(info->ufe_ring.sring));
737 ret = remap_pfn_range(vma, vma->vm_start,
738 __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
739 PAGE_SIZE, vma->vm_page_prot);
741 WPRINTK("Mapping user ring failed!\n");
745 /* Mark this VM as containing foreign pages, and set up mappings. */
746 info->foreign_map.map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
747 sizeof(*info->foreign_map.map), GFP_KERNEL);
748 if (info->foreign_map.map == NULL) {
749 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
753 vma->vm_private_data = &info->foreign_map;
754 vma->vm_flags |= VM_FOREIGN;
755 vma->vm_flags |= VM_DONTCOPY;
758 vma->vm_mm->context.has_foreign_mappings = 1;
761 info->mm = get_task_mm(current);
766 /* Clear any active mappings. */
767 zap_page_range(vma, vma->vm_start,
768 vma->vm_end - vma->vm_start, NULL);
769 info->rings_vstart = 0;
775 static long blktap_ioctl(struct file *filp, unsigned int cmd,
778 tap_blkif_t *info = filp->private_data;
781 case BLKTAP_IOCTL_KICK_FE:
783 /* There are fe messages to process. */
784 return blktap_read_ufe_ring(info);
786 case BLKTAP_IOCTL_SETMODE:
789 if (BLKTAP_MODE_VALID(arg)) {
791 /* XXX: may need to flush rings here. */
792 DPRINTK("set mode to %lx\n", arg);
798 case BLKTAP_IOCTL_PRINT_IDXS:
801 pr_info("User Rings: \n-----------\n");
802 pr_info("UF: rsp_cons: %2d, req_prod_prv: %2d "
803 "| req_prod: %2d, rsp_prod: %2d\n",
804 info->ufe_ring.rsp_cons,
805 info->ufe_ring.req_prod_pvt,
806 info->ufe_ring.sring->req_prod,
807 info->ufe_ring.sring->rsp_prod);
811 case BLKTAP_IOCTL_SENDPID:
814 info->pid = (pid_t)arg;
815 info->pid_ns = current->nsproxy->pid_ns;
816 DPRINTK("pid received %p:%d\n",
817 info->pid_ns, info->pid);
821 case BLKTAP_IOCTL_NEWINTF:
823 uint64_t val = (uint64_t)arg;
824 domid_translate_t *tr = (domid_translate_t *)&val;
826 DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
827 tr->domid, tr->busid);
828 info = get_next_free_dev();
830 WPRINTK("Error initialising /dev/xen/blktap - "
831 "No more devices\n");
834 info->trans.domid = tr->domid;
835 info->trans.busid = tr->busid;
838 case BLKTAP_IOCTL_NEWINTF_EXT:
840 void __user *udata = (void __user *) arg;
841 domid_translate_ext_t tr;
843 if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t)))
846 DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n",
848 info = get_next_free_dev();
850 WPRINTK("Error initialising /dev/xen/blktap - "
851 "No more devices\n");
854 info->trans.domid = tr.domid;
855 info->trans.busid = tr.busid;
858 case BLKTAP_IOCTL_FREEINTF:
860 unsigned long dev = arg;
863 if (info || dev >= MAX_TAP_DEV)
868 return 0; /* should this be an error? */
870 spin_lock_irqsave(&pending_free_lock, flags);
871 if (info->dev_pending)
872 info->dev_pending = 0;
873 spin_unlock_irqrestore(&pending_free_lock, flags);
877 case BLKTAP_IOCTL_MINOR:
879 unsigned long dev = arg;
881 if (dev >= MAX_TAP_DEV)
891 case BLKTAP_IOCTL_MAJOR:
894 case BLKTAP_QUERY_ALLOC_REQS:
895 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%lu\n",
896 alloc_pending_reqs, MAX_PENDING_REQS);
return (alloc_pending_reqs * 100) / MAX_PENDING_REQS;
902 static unsigned int blktap_poll(struct file *filp, poll_table *wait)
904 tap_blkif_t *info = filp->private_data;
906 /* do not work on the control device */
910 poll_wait(filp, &info->wait, wait);
911 if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
912 RING_PUSH_REQUESTS(&info->ufe_ring);
913 return POLLIN | POLLRDNORM;
918 static void blktap_kick_user(int idx)
922 if (idx < 0 || idx >= MAX_TAP_DEV)
929 wake_up_interruptible(&info->wait);
934 static int do_block_io_op(blkif_t *blkif);
935 static void dispatch_rw_block_io(blkif_t *blkif,
936 blkif_request_t *req,
937 pending_req_t *pending_req);
938 static void make_response(blkif_t *blkif, u64 id,
939 unsigned short op, int st);
941 /******************************************************************
944 static int req_increase(void)
948 if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
951 pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t)
952 * MAX_PENDING_REQS, GFP_KERNEL);
953 foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
955 if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
958 DPRINTK("reqs=%lu, pages=%d\n", MAX_PENDING_REQS, mmap_pages);
960 for (i = 0; i < MAX_PENDING_REQS; i++) {
961 list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
963 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
964 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
965 BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
970 DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
974 free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
975 kfree(pending_reqs[mmap_alloc]);
976 WPRINTK("%s: out of memory\n", __FUNCTION__);
980 static void mmap_req_del(int mmap)
982 assert_spin_locked(&pending_free_lock);
984 kfree(pending_reqs[mmap]);
985 pending_reqs[mmap] = NULL;
free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
988 foreign_pages[mmap] = NULL;
991 DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
995 static pending_req_t* alloc_req(void)
997 pending_req_t *req = NULL;
1000 spin_lock_irqsave(&pending_free_lock, flags);
1002 if (!list_empty(&pending_free)) {
1003 req = list_entry(pending_free.next, pending_req_t, free_list);
1004 list_del(&req->free_list);
1008 alloc_pending_reqs++;
1009 spin_unlock_irqrestore(&pending_free_lock, flags);
1014 static void free_req(pending_req_t *req)
1016 unsigned long flags;
1019 spin_lock_irqsave(&pending_free_lock, flags);
1021 alloc_pending_reqs--;
1022 if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
1024 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
1025 spin_unlock_irqrestore(&pending_free_lock, flags);
1028 was_empty = list_empty(&pending_free);
1029 list_add(&req->free_list, &pending_free);
1031 spin_unlock_irqrestore(&pending_free_lock, flags);
1034 wake_up(&pending_free_wq);
1037 static void blktap_zap_page_range(struct mm_struct *mm,
1038 unsigned long uvaddr, int nr_pages)
1040 unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT);
1041 struct vm_area_struct *vma;
1043 vma = find_vma(mm, uvaddr);
1044 while (vma && uvaddr < end) {
1045 unsigned long s = max(uvaddr, vma->vm_start);
1046 unsigned long e = min(end, vma->vm_end);
1048 zap_page_range(vma, s, e - s, NULL);
1055 static void fast_flush_area(pending_req_t *req, unsigned int k_idx,
1056 unsigned int u_idx, tap_blkif_t *info)
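/*
 * Tear down every mapping created for this request: the kernel-side grant
 * mapping always and, in the non-auto-translated case, the PTE that was
 * mapped into the tapdisk process -- hence up to two unmap ops per segment.
 */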
1058 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1059 unsigned int i, mmap_idx, invcount = 0;
1060 struct grant_handle_pair *khandle;
1063 unsigned long uvaddr;
1064 struct mm_struct *mm = info->mm;
1067 down_read(&mm->mmap_sem);
1069 if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
1071 blktap_zap_page_range(mm,
1072 MMAP_VADDR(info->user_vstart, u_idx, 0),
1074 info->idx_map[u_idx].mem = INVALID_MIDX;
1075 up_read(&mm->mmap_sem);
1079 mmap_idx = req->mem_idx;
1081 spin_lock(&info->map_lock);
1083 for (i = 0; i < req->nr_pages; i++) {
1084 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
1086 khandle = &pending_handle(mmap_idx, k_idx, i);
1088 if (khandle->kernel != INVALID_GRANT_HANDLE) {
1089 gnttab_set_unmap_op(&unmap[invcount],
1090 idx_to_kaddr(mmap_idx, k_idx, i),
1091 GNTMAP_host_map, khandle->kernel);
1094 set_phys_to_machine(
1095 page_to_pfn(idx_to_page(mmap_idx, k_idx, i)),
1099 if (mm != NULL && khandle->user != INVALID_GRANT_HANDLE) {
1100 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
1101 if (create_lookup_pte_addr(
1103 MMAP_VADDR(info->user_vstart, u_idx, i),
1105 spin_unlock(&info->map_lock);
1106 WPRINTK("Couldn't get a pte addr!\n");
1110 gnttab_set_unmap_op(&unmap[invcount], ptep,
1112 | GNTMAP_application_map
1113 | GNTMAP_contains_pte,
1118 BLKTAP_INVALIDATE_HANDLE(khandle);
1120 ret = HYPERVISOR_grant_table_op(
1121 GNTTABOP_unmap_grant_ref, unmap, invcount);
1124 info->idx_map[u_idx].mem = INVALID_MIDX;
1126 spin_unlock(&info->map_lock);
1128 up_read(&mm->mmap_sem);
1131 /******************************************************************
1132 * SCHEDULER FUNCTIONS
1135 static void print_stats(blkif_t *blkif)
1137 printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | pk %4d\n",
1138 current->comm, blkif->st_oo_req,
1139 blkif->st_rd_req, blkif->st_wr_req, blkif->st_pk_req);
1140 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
1141 blkif->st_rd_req = 0;
1142 blkif->st_wr_req = 0;
1143 blkif->st_oo_req = 0;
1144 blkif->st_pk_req = 0;
1147 int tap_blkif_schedule(void *arg)
1149 blkif_t *blkif = arg;
1155 printk(KERN_DEBUG "%s: started\n", current->comm);
1157 while (!kthread_should_stop()) {
1158 if (try_to_freeze())
1161 wait_event_interruptible(
1163 blkif->waiting_reqs || kthread_should_stop());
1164 wait_event_interruptible(
1166 !list_empty(&pending_free) || kthread_should_stop());
1168 blkif->waiting_reqs = 0;
1169 smp_mb(); /* clear flag *before* checking for work */
1171 if (do_block_io_op(blkif))
1172 blkif->waiting_reqs = 1;
1174 if (log_stats && time_after(jiffies, blkif->st_print))
1181 printk(KERN_DEBUG "%s: exiting\n", current->comm);
1183 blkif->xenblkd = NULL;
1184 info = tapfds[blkif->dev_num];
1188 struct mm_struct *mm = xchg(&info->mm, NULL);
1197 /******************************************************************
1198 * COMPLETION CALLBACK -- Called by user level ioctl()
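 *
 * Expected flow (sketch): tapdisk pushes completions onto the mmap()ed ufe
 * ring and issues BLKTAP_IOCTL_KICK_FE; the ioctl handler calls
 * blktap_read_ufe_ring() below, which unmaps each request's segments and
 * forwards the response to the guest via make_response().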
1201 static int blktap_read_ufe_ring(tap_blkif_t *info)
1203 /* This is called to read responses from the UFE ring. */
1205 blkif_response_t *resp;
1206 blkif_t *blkif=NULL;
1207 unsigned int pending_idx, usr_idx, mmap_idx;
1208 pending_req_t *pending_req;
1213 /* We currently only forward packets in INTERCEPT_FE mode. */
1214 if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
1217 /* for each outstanding message on the UFEring */
1218 rp = info->ufe_ring.sring->rsp_prod;
1221 for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
1222 blkif_response_t res;
1223 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
1224 memcpy(&res, resp, sizeof(res));
1225 mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
1226 ++info->ufe_ring.rsp_cons;
1228 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
1229 if (res.id >= MAX_PENDING_REQS) {
1230 WPRINTK("incorrect req map [%llx]\n",
1231 (unsigned long long)res.id);
1235 usr_idx = (unsigned int)res.id;
1236 pending_idx = info->idx_map[usr_idx].req;
1237 mmap_idx = info->idx_map[usr_idx].mem;
1239 if (mmap_idx >= mmap_alloc ||
1240 pending_idx >= MAX_PENDING_REQS) {
1241 WPRINTK("incorrect req map [%d],"
1242 " internal map [%d,%d]\n",
1243 usr_idx, mmap_idx, pending_idx);
1247 pending_req = &pending_reqs[mmap_idx][pending_idx];
1248 blkif = pending_req->blkif;
1250 for (j = 0; j < pending_req->nr_pages; j++) {
1252 unsigned long uvaddr;
1256 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
1258 pg = idx_to_page(mmap_idx, pending_idx, j);
1259 ClearPageReserved(pg);
1260 offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1261 info->foreign_map.map[offset] = NULL;
1263 fast_flush_area(pending_req, pending_idx, usr_idx, info);
1264 make_response(blkif, pending_req->id, res.operation,
1266 blkif_put(pending_req->blkif);
1267 free_req(pending_req);
1274 /******************************************************************************
1275 * NOTIFICATION FROM GUEST OS.
1278 static void blkif_notify_work(blkif_t *blkif)
1280 blkif->waiting_reqs = 1;
1281 wake_up(&blkif->wq);
1284 irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
1286 blkif_notify_work(dev_id);
1292 /******************************************************************
1293 * DOWNWARD CALLS -- These interface with the block-device layer proper.
1295 static int print_dbug = 1;
1296 static int do_block_io_op(blkif_t *blkif)
1298 blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1299 blkif_request_t req;
1300 pending_req_t *pending_req;
1305 rc = blk_rings->common.req_cons;
1306 rp = blk_rings->common.sring->req_prod;
1307 rmb(); /* Ensure we see queued requests up to 'rp'. */
1309 /*Check blkif has corresponding UE ring*/
1310 if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV) {
1313 WPRINTK("Corresponding UE "
1314 "ring does not exist!\n");
1315 print_dbug = 0; /*We only print this message once*/
1320 info = tapfds[blkif->dev_num];
1322 if (!info || !test_bit(0, &info->dev_inuse)) {
1324 WPRINTK("Can't get UE info!\n");
1332 if (RING_FULL(&info->ufe_ring)) {
1333 WPRINTK("RING_FULL! More to do\n");
1338 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
1339 WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1345 if (kthread_should_stop()) {
1350 pending_req = alloc_req();
1351 if (NULL == pending_req) {
1357 switch (blkif->blk_protocol) {
1358 case BLKIF_PROTOCOL_NATIVE:
1359 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
1362 case BLKIF_PROTOCOL_X86_32:
1363 blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1365 case BLKIF_PROTOCOL_X86_64:
1366 blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1371 blk_rings->common.req_cons = ++rc; /* before make_response() */
1373 /* Apply all sanity checks to /private copy/ of request. */
1376 switch (req.operation) {
1379 dispatch_rw_block_io(blkif, &req, pending_req);
1382 case BLKIF_OP_WRITE_BARRIER:
1383 /* TODO Some counter? */
1385 case BLKIF_OP_WRITE:
1387 dispatch_rw_block_io(blkif, &req, pending_req);
1390 case BLKIF_OP_PACKET:
1392 dispatch_rw_block_io(blkif, &req, pending_req);
1396 /* A good sign something is wrong: sleep for a while to
1397 * avoid excessive CPU consumption by a bad guest. */
1399 WPRINTK("unknown operation [%d]\n",
1401 make_response(blkif, req.id, req.operation,
1403 free_req(pending_req);
1407 /* Yield point for this unbounded loop. */
1411 blktap_kick_user(blkif->dev_num);
1416 static void dispatch_rw_block_io(blkif_t *blkif,
1417 blkif_request_t *req,
1418 pending_req_t *pending_req)
1420 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
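/* Up to two grant-map ops are queued per segment below: a GNTMAP_host_map
 * for the kernel address and, in the non-auto-translated case, a
 * GNTMAP_contains_pte op that installs the page into the tapdisk PTE. */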
1422 int ret, i, op, nr_sects = 0;
1424 blkif_request_t *target;
1425 unsigned int mmap_idx = pending_req->mem_idx;
1426 unsigned int pending_idx = RTN_PEND_IDX(pending_req, mmap_idx);
1427 unsigned int usr_idx;
1429 struct mm_struct *mm;
1430 struct vm_area_struct *vma = NULL;
1432 if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV)
1435 info = tapfds[blkif->dev_num];
1439 /* Check we have space on user ring - should never fail. */
1440 spin_lock(&info->map_lock);
1441 usr_idx = GET_NEXT_REQ(info->idx_map);
1442 spin_unlock(&info->map_lock);
1443 if (usr_idx >= MAX_PENDING_REQS) {
1448 /* Check that number of segments is sane. */
1449 nseg = req->nr_segments;
1450 if (unlikely(nseg == 0 && req->operation != BLKIF_OP_WRITE_BARRIER) ||
1451 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1452 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1456 /* Make sure userspace is ready. */
1457 if (!info->ring_ok) {
1458 WPRINTK("ring not ready for requests!\n");
1463 if (RING_FULL(&info->ufe_ring)) {
1464 WPRINTK("fe_ring is full, "
1465 "IO Request will be dropped. %d %d\n",
1466 RING_SIZE(&info->ufe_ring),
1467 RING_SIZE(&blkif->blk_rings.common));
1471 pending_req->blkif = blkif;
1472 pending_req->id = req->id;
1473 pending_req->nr_pages = nseg;
1475 flags = GNTMAP_host_map;
1476 switch (req->operation) {
1477 case BLKIF_OP_WRITE:
1478 case BLKIF_OP_WRITE_BARRIER:
1479 flags |= GNTMAP_readonly;
1485 if (!xen_feature(XENFEAT_auto_translated_physmap))
1486 down_read(&mm->mmap_sem);
1487 for (i = 0; i < nseg; i++) {
1488 unsigned long uvaddr;
1489 unsigned long kvaddr;
1492 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1493 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1495 gnttab_set_map_op(&map[op], kvaddr, flags,
1496 req->seg[i].gref, blkif->domid);
1499 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1500 /* Now map it to user. */
1501 ret = create_lookup_pte_addr(mm, uvaddr, &ptep);
1503 up_read(&mm->mmap_sem);
1504 WPRINTK("Couldn't get a pte addr!\n");
1508 gnttab_set_map_op(&map[op], ptep,
1509 flags | GNTMAP_application_map
1510 | GNTMAP_contains_pte,
1511 req->seg[i].gref, blkif->domid);
1515 nr_sects += (req->seg[i].last_sect -
1516 req->seg[i].first_sect + 1);
1519 if (xen_feature(XENFEAT_auto_translated_physmap))
1520 down_read(&mm->mmap_sem);
1522 spin_lock(&info->map_lock);
1524 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1527 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1528 for (i = 0; i < (nseg*2); i+=2) {
1529 unsigned long uvaddr;
1530 unsigned long offset;
1533 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1535 gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
1537 if (unlikely(map[i].status != GNTST_okay)) {
1538 WPRINTK("invalid kernel buffer -- could not remap it\n");
1540 map[i].handle = INVALID_GRANT_HANDLE;
1543 if (unlikely(map[i+1].status != GNTST_okay)) {
1544 WPRINTK("invalid user buffer -- could not remap it\n");
1546 map[i+1].handle = INVALID_GRANT_HANDLE;
1549 pending_handle(mmap_idx, pending_idx, i/2).kernel
1551 pending_handle(mmap_idx, pending_idx, i/2).user
1557 pg = idx_to_page(mmap_idx, pending_idx, i/2);
1558 set_phys_to_machine(page_to_pfn(pg),
1559 FOREIGN_FRAME(map[i].dev_bus_addr
1561 offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1562 info->foreign_map.map[offset] = pg;
1565 for (i = 0; i < nseg; i++) {
1566 unsigned long uvaddr;
1567 unsigned long offset;
1570 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1572 gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
1574 if (unlikely(map[i].status != GNTST_okay)) {
1575 WPRINTK("invalid kernel buffer -- could not remap it\n");
1577 map[i].handle = INVALID_GRANT_HANDLE;
1580 pending_handle(mmap_idx, pending_idx, i).kernel
1586 offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1587 pg = idx_to_page(mmap_idx, pending_idx, i);
1588 info->foreign_map.map[offset] = pg;
1592 /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
1593 info->idx_map[usr_idx].mem = mmap_idx;
1594 info->idx_map[usr_idx].req = pending_idx;
1596 spin_unlock(&info->map_lock);
1601 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1602 for (i = 0; i < nseg; i++) {
1603 struct page *pg = idx_to_page(mmap_idx, pending_idx, i);
1604 unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
1606 if (vma && uvaddr >= vma->vm_end) {
1609 (uvaddr < vma->vm_start ||
1610 uvaddr >= vma->vm_end))
1614 vma = find_vma(mm, uvaddr);
/* This virtual area was already munmapped,
 * so skip to the next page. */
1620 ret = vm_insert_page(vma, uvaddr, pg);
1626 up_read(&mm->mmap_sem);
1629 /* Finally, write the request message to the user ring. */
1630 target = RING_GET_REQUEST(&info->ufe_ring,
1631 info->ufe_ring.req_prod_pvt);
1632 memcpy(target, req, sizeof(*req));
1633 target->id = usr_idx;
1634 wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
1635 info->ufe_ring.req_prod_pvt++;
1637 switch (req->operation) {
1639 blkif->st_rd_sect += nr_sects;
1641 case BLKIF_OP_WRITE:
1642 case BLKIF_OP_WRITE_BARRIER:
1643 blkif->st_wr_sect += nr_sects;
1650 up_read(&mm->mmap_sem);
1651 WPRINTK("Reached Fail_flush\n");
1652 fast_flush_area(pending_req, pending_idx, usr_idx, info);
1654 make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1655 free_req(pending_req);
1656 msleep(1); /* back off a bit */
1661 /******************************************************************
1662 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1666 static void make_response(blkif_t *blkif, u64 id,
1667 unsigned short op, int st)
1669 blkif_response_t resp;
1670 unsigned long flags;
1671 blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1676 resp.operation = op;
1679 spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1680 /* Place on the response ring for the relevant domain. */
1681 switch (blkif->blk_protocol) {
1682 case BLKIF_PROTOCOL_NATIVE:
1683 memcpy(RING_GET_RESPONSE(&blk_rings->native,
1684 blk_rings->native.rsp_prod_pvt),
1685 &resp, sizeof(resp));
1687 case BLKIF_PROTOCOL_X86_32:
1688 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
1689 blk_rings->x86_32.rsp_prod_pvt),
1690 &resp, sizeof(resp));
1692 case BLKIF_PROTOCOL_X86_64:
1693 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
1694 blk_rings->x86_64.rsp_prod_pvt),
1695 &resp, sizeof(resp));
1700 blk_rings->common.rsp_prod_pvt++;
1701 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1703 if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
1705 * Tail check for pending requests. Allows frontend to avoid
1706 * notifications if requests are already in flight (lower
1707 * overheads and promotes batching).
1709 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1710 } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
1714 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1716 blkif_notify_work(blkif);
1718 notify_remote_via_irq(blkif->irq);
1721 static int __init blkif_init(void)
1725 if (!is_running_on_xen())
1728 INIT_LIST_HEAD(&pending_free);
1729 for(i = 0; i < 2; i++) {
1730 ret = req_increase();
1737 tap_blkif_interface_init();
1739 alloc_pending_reqs = 0;
1741 tap_blkif_xenbus_init();
1743 /* Dynamically allocate a major for this device */
1744 ret = __register_chrdev(0, 0, MAX_TAP_DEV, "blktap", &blktap_fops);
1747 WPRINTK("Couldn't register /dev/xen/blktap\n");
1753 /* tapfds[0] is always NULL */
1754 blktap_next_minor++;
1756 DPRINTK("Created misc_dev %d:0 [/dev/xen/blktap0]\n", ret);
1758 /* Make sure the xen class exists */
1759 if (get_xen_class()) {
1761 * This will allow udev to create the blktap ctrl device.
1762 * We only want to create blktap0 first. We don't want
1763 * to flood the sysfs system with needless blktap devices.
1764 * We only create the device when a request of a new device is
1767 xen_class_device_create(&blktap_type, NULL,
1768 MKDEV(blktap_major, 0), NULL,
1771 /* this is bad, but not fatal */
1772 WPRINTK("sysfs xen_class not created\n");
1775 DPRINTK("Blktap device successfully created\n");
1780 module_init(blkif_init);
1782 MODULE_LICENSE("Dual BSD/GPL");
1783 MODULE_ALIAS("devname:xen/blktap0");
1784 MODULE_ALIAS("xen-backend:tap");