/******************************************************************************
 * drivers/xen/blktap/blktap.c
 *
 * Back-end driver for user level virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. Requests
 * are remapped to a user-space memory region.
 *
 * Based on the blkback driver code.
 *
 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
 *
 * Clean ups and fix ups:
 * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/list.h>
#include <asm/hypervisor.h>
#include <xen/balloon.h>
#include <xen/driver_util.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/gfp.h>
#include <linux/poll.h>
#include <linux/delay.h>
#include <asm/tlbflush.h>
#define MAX_TAP_DEV 256		/* the maximum number of tapdisk ring devices    */
#define MAX_DEV_NAME 100	/* the max tapdisk ring device name e.g. blktap0 */

/*
 * The maximum number of requests that can be outstanding at any time
 * is determined by
 *
 *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
 *
 * where mmap_alloc < MAX_DYNAMIC_MEM.
 *
 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
 * sysfs.
 */
#define BLK_RING_SIZE		__RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
#define MAX_DYNAMIC_MEM		BLK_RING_SIZE
#define MAX_PENDING_REQS	BLK_RING_SIZE
#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_start, _req, _seg)					\
	(_start +							\
	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +	\
	 ((_seg) * PAGE_SIZE))
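
/*
 * Worked example (illustrative only, not from the original source): with
 * 4 KiB pages and BLKIF_MAX_SEGMENTS_PER_REQUEST == 11, the data page for
 * user ring slot 3, segment 2 sits at
 *
 *	MMAP_VADDR(user_vstart, 3, 2)
 *		== user_vstart + (3 * 11 + 2) * PAGE_SIZE
 *		== user_vstart + 0x23000
 *
 * i.e. the user-space region is a dense two-dimensional array of pages
 * indexed by (request slot, segment).
 */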
static int blkif_reqs = MAX_PENDING_REQS;
static int mmap_pages = MMAP_PAGES;

#define RING_PAGES 1	/* BLKTAP - immediately before the mmap area, we
			 * have a bunch of pages reserved for shared memory
			 * rings.
			 */
/* Data struct handed back to userspace for tapdisk device to VBD mapping */
typedef struct domid_translate {
	unsigned short domid;
	unsigned short busid;
} domid_translate_t;

/* Data struct associated with each of the tapdisk devices */
typedef struct tap_blkif {
	struct vm_area_struct *vma;   /* Shared memory area                  */
	unsigned long rings_vstart;   /* Kernel memory mapping               */
	unsigned long user_vstart;    /* User memory mapping                 */
	unsigned long dev_inuse;      /* One process opens device at a time. */
	unsigned long dev_pending;    /* In process of being opened          */
	unsigned long ring_ok;        /* make this ring->state               */
	blkif_front_ring_t ufe_ring;  /* Rings up to user space.             */
	wait_queue_head_t wait;       /* for poll                            */
	unsigned long mode;           /* current switching mode              */
	int minor;                    /* Minor number for tapdisk device     */
	pid_t pid;                    /* tapdisk process id                  */
	enum { RUNNING, CLEANSHUTDOWN } status; /* Detect a clean userspace
						   shutdown                  */
	unsigned long *idx_map;       /* Record the user ring id to kern
					 [req id, idx] tuple                 */
	blkif_t *blkif;               /* Associate blkif with tapdev         */
	struct domid_translate trans; /* Translation from domid to bus.      */
} tap_blkif_t;
static struct tap_blkif *tapfds[MAX_TAP_DEV];
static int blktap_next_minor;

module_param(blkif_reqs, int, 0);
/* Run-time switchable: /sys/module/blktap/parameters/ */
static unsigned int log_stats = 0;
static unsigned int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);
/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
	blkif_t          *blkif;
	u64               id;
	unsigned short    mem_idx;
	int               nr_pages;
	unsigned short    operation;
	int               status;
	struct list_head  free_list;
} pending_req_t;

static pending_req_t *pending_reqs[MAX_PENDING_REQS];
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
static int alloc_pending_reqs;
typedef unsigned int PEND_RING_IDX;

static inline int MASK_PEND_IDX(int i) {
	return (i & (MAX_PENDING_REQS-1));
}

static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
	return (req - pending_reqs[idx]);
}

#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
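
/*
 * Illustrative round trip (a sketch, not from the original source): a
 * request is identified by its (mem_idx, pending ring index) pair, and
 * RTN_PEND_IDX() recovers the slot index by plain pointer arithmetic:
 *
 *	pending_req_t *req = &pending_reqs[mem][MASK_PEND_IDX(idx)];
 *	RTN_PEND_IDX(req, req->mem_idx) == MASK_PEND_IDX(idx);
 */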
#define BLKBACK_INVALID_HANDLE (~0)

static struct page **foreign_pages[MAX_DYNAMIC_MEM];
static inline unsigned long idx_to_kaddr(
	unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
{
	unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
	unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
	return (unsigned long)pfn_to_kaddr(pfn);
}

static unsigned short mmap_alloc = 0;
static unsigned short mmap_lock = 0;
static unsigned short mmap_inuse = 0;
/******************************************************************
 * GRANT HANDLES
 */

/* When using grant tables to map a frame for device access then the
 * handle returned must be used to unmap the frame. This is needed to
 * drop the ref count on the frame.
 */
struct grant_handle_pair
{
	grant_handle_t kernel;
	grant_handle_t user;
};
#define INVALID_GRANT_HANDLE	0xFFFF

static struct grant_handle_pair
    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
#define pending_handle(_id, _idx, _i) \
	(pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
	+ (_i)])
static int blktap_read_ufe_ring(tap_blkif_t *info); /* local prototypes */

#define BLKTAP_MINOR 0	/* /dev/xen/blktap has a dynamic major */
#define BLKTAP_DEV_DIR	"/dev/xen"

static int blktap_major;

#define BLKTAP_IOCTL_KICK_FE		1
#define BLKTAP_IOCTL_KICK_BE		2 /* currently unused */
#define BLKTAP_IOCTL_SETMODE		3
#define BLKTAP_IOCTL_SENDPID		4
#define BLKTAP_IOCTL_NEWINTF		5
#define BLKTAP_IOCTL_MINOR		6
#define BLKTAP_IOCTL_MAJOR		7
#define BLKTAP_QUERY_ALLOC_REQS		8
#define BLKTAP_IOCTL_FREEINTF		9
#define BLKTAP_IOCTL_PRINT_IDXS		100
/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
#define BLKTAP_MODE_PASSTHROUGH	 0x00000000  /* default */
#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
#define BLKTAP_MODE_INTERCEPT_BE 0x00000002  /* unimp. */

#define BLKTAP_MODE_INTERPOSE \
	(BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)


static inline int BLKTAP_MODE_VALID(unsigned long arg)
{
	return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
		(arg == BLKTAP_MODE_INTERCEPT_FE) ||
		(arg == BLKTAP_MODE_INTERPOSE   ));
}
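
/*
 * Illustrative use from a tapdisk-like userspace process (a sketch under
 * assumptions, not taken from the userspace tools; fd handling and error
 * checks are omitted):
 *
 *	int fd = open("/dev/xen/blktap0", O_RDWR);
 *	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
 *	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
 *	mmap(NULL, (MMAP_PAGES + RING_PAGES) * getpagesize(),
 *	     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	...
 *	ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);	// after queueing responses
 */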
/* Requests passing through the tap to userspace are re-assigned an ID.
 * We must record a mapping between the BE [IDX,ID] tuple and the userspace
 * ring ID.
 */

static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
{
	return ((fe_dom << 16) | MASK_PEND_IDX(idx));
}

extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
{
	return (PEND_RING_IDX)(id & 0x0000ffff);
}

extern inline int ID_TO_MIDX(unsigned long id)
{
	return (int)(id >> 16);
}

#define INVALID_REQ 0xdead0000
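
/*
 * Packing sketch (illustrative): despite the parameter name, MAKE_ID() is
 * called below with the mmap area index in the upper 16 bits and the
 * pending ring index in the lower 16 bits, so:
 *
 *	unsigned long id = MAKE_ID(mmap_idx, pending_idx);
 *	ID_TO_MIDX(id) == mmap_idx;
 *	ID_TO_IDX(id)  == MASK_PEND_IDX(pending_idx);
 */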
/* TODO: Convert to a free list */
static inline int GET_NEXT_REQ(unsigned long *idx_map)
{
	int i;

	for (i = 0; i < MAX_PENDING_REQS; i++)
		if (idx_map[i] == INVALID_REQ)
			return i;

	return INVALID_REQ;
}

static inline int OFFSET_TO_USR_IDX(int offset)
{
	return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
}

static inline int OFFSET_TO_SEG(int offset)
{
	return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
}
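
/*
 * These two helpers invert the MMAP_VADDR() layout: for a fault or unmap at
 * uvaddr, offset = (uvaddr - user_vstart) >> PAGE_SHIFT, and then
 * OFFSET_TO_USR_IDX(offset) is the user ring slot while OFFSET_TO_SEG(offset)
 * is the segment within that request.
 */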
#define BLKTAP_INVALID_HANDLE(_g) \
	(((_g->kernel) == INVALID_GRANT_HANDLE) && \
	 ((_g->user) == INVALID_GRANT_HANDLE))

#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
	(_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
	} while (0)
/******************************************************************
 * BLKTAP VM OPS
 */

static struct page *blktap_nopage(struct vm_area_struct *vma,
				  unsigned long address,
				  int *type)
{
	/*
	 * if the page has not been mapped in by the driver then return
	 * NOPAGE_SIGBUS to the domain.
	 */
	return NOPAGE_SIGBUS;
}
static pte_t blktap_clear_pte(struct vm_area_struct *vma,
			      unsigned long uvaddr,
			      pte_t *ptep, int is_fullmm)
{
	pte_t copy;
	tap_blkif_t *info;
	int offset, seg, usr_idx, pending_idx, mmap_idx;
	unsigned long uvstart = vma->vm_start + (RING_PAGES << PAGE_SHIFT);
	unsigned long kvaddr;
	struct page **map;
	struct page *pg;
	struct grant_handle_pair *khandle;
	struct gnttab_unmap_grant_ref unmap[2];
	int count = 0;

	/*
	 * If the address is before the start of the grant mapped region or
	 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
	 */
	if (uvaddr < uvstart || vma->vm_file == NULL)
		return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
					       ptep, is_fullmm);

	info = vma->vm_file->private_data;
	map = vma->vm_private_data;

	/* TODO Should these be changed to if statements? */
	BUG_ON(!info);
	BUG_ON(!info->idx_map);
	BUG_ON(!map);

	offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT);
	usr_idx = OFFSET_TO_USR_IDX(offset);
	seg = OFFSET_TO_SEG(offset);

	pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
	mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);

	kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
	pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
	ClearPageReserved(pg);
	map[offset + RING_PAGES] = NULL;

	khandle = &pending_handle(mmap_idx, pending_idx, seg);

	if (khandle->kernel != INVALID_GRANT_HANDLE) {
		gnttab_set_unmap_op(&unmap[count], kvaddr,
				    GNTMAP_host_map, khandle->kernel);
		count++;

		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
				    INVALID_P2M_ENTRY);
	}

	if (khandle->user != INVALID_GRANT_HANDLE) {
		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

		copy = *ptep;
		gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep),
				    GNTMAP_host_map
				    | GNTMAP_application_map
				    | GNTMAP_contains_pte,
				    khandle->user);
		count++;
	} else {
		BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));

		/* USING SHADOW PAGE TABLES. */
		copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
					       is_fullmm);
	}

	BLKTAP_INVALIDATE_HANDLE(khandle);
	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
				      unmap, count))
		BUG();

	return copy;
}
struct vm_operations_struct blktap_vm_ops = {
	nopage:  blktap_nopage,
	zap_pte: blktap_clear_pte,
};
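
/*
 * Note (added for clarity): the "nopage:"/"zap_pte:" form is the old
 * GCC-specific designated-initialiser syntax, equivalent to
 * ".nopage = blktap_nopage" etc.  zap_pte is the vm_operations hook added
 * by this Xen-patched tree and is invoked as PTEs of the foreign-mapped
 * region are torn down, which is how blktap_clear_pte() above gets called.
 */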
/******************************************************************
 * BLKTAP FILE OPS
 */

/* Function Declarations */
static tap_blkif_t *get_next_free_dev(void);
static int blktap_open(struct inode *inode, struct file *filp);
static int blktap_release(struct inode *inode, struct file *filp);
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
static int blktap_ioctl(struct inode *inode, struct file *filp,
			unsigned int cmd, unsigned long arg);
static unsigned int blktap_poll(struct file *file, poll_table *wait);

static const struct file_operations blktap_fops = {
	.owner   = THIS_MODULE,
	.poll    = blktap_poll,
	.ioctl   = blktap_ioctl,
	.open    = blktap_open,
	.release = blktap_release,
	.mmap    = blktap_mmap,
};
static tap_blkif_t *get_next_free_dev(void)
	/*
	 * This is called only from the ioctl, which
	 * means we should always have interrupts enabled.
	 */
	BUG_ON(irqs_disabled());

	spin_lock_irq(&pending_free_lock);

	/* tapfds[0] is always NULL */
	for (minor = 1; minor < blktap_next_minor; minor++) {
		info = tapfds[minor];
		/* we could have failed a previous attempt. */
		    ((info->dev_inuse == 0) &&
		     (info->dev_pending == 0)) ) {
			info->dev_pending = 1;

	/*
	 * We didn't find a free device. If we can still allocate
	 * more, then we grab the next device minor that is
	 * available.  This is done while we are still under
	 * the protection of the pending_free_lock.
	 */
	if (blktap_next_minor < MAX_TAP_DEV)
		minor = blktap_next_minor++;

	spin_unlock_irq(&pending_free_lock);

	if (!info && minor > 0) {
		info = kzalloc(sizeof(*info), GFP_KERNEL);
		if (unlikely(!info)) {
			/*
			 * If we failed here, try to put back
			 * the next minor number. But if one
			 * was just taken, then we just lose this
			 * minor.  We can try to allocate this
			 * minor again later.
			 */
			spin_lock_irq(&pending_free_lock);
			if (blktap_next_minor == minor+1)
			spin_unlock_irq(&pending_free_lock);

		/*
		 * Make sure that we have a minor before others can
		 * see us.
		 */
		tapfds[minor] = info;

		if ((class = get_xen_class()) != NULL)
			class_device_create(class, NULL,
					    MKDEV(blktap_major, minor), NULL,
int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
	for (i = 1; i < blktap_next_minor; i++) {
		    (info->trans.domid == domid) &&
		    (info->trans.busid == xenbus_id) ) {
			info->status = RUNNING;

void signal_tapdisk(int idx)
	struct task_struct *ptask;

	if ((idx < 0) || (idx >= MAX_TAP_DEV) || !info)

	ptask = find_task_by_pid(info->pid);
		info->status = CLEANSHUTDOWN;
static int blktap_open(struct inode *inode, struct file *filp)
	blkif_sring_t *sring;
	int idx = iminor(inode) - BLKTAP_MINOR;

	/* ctrl device, treat differently */

	if ((idx < 0) || (idx >= MAX_TAP_DEV) || !info) {
		WPRINTK("Unable to open device /dev/xen/blktap%d\n",

	DPRINTK("Opening device /dev/xen/blktap%d\n",idx);

	/* Only one process can access the device at a time */
	if (test_and_set_bit(0, &info->dev_inuse))

	info->dev_pending = 0;

	/* Allocate the fe ring. */
	sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);

	SetPageReserved(virt_to_page(sring));

	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);

	filp->private_data = info;

	info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
				GFP_KERNEL);

	if (info->idx_map == NULL)

	init_waitqueue_head(&info->wait);
	for (i = 0; i < MAX_PENDING_REQS; i++)
		info->idx_map[i] = INVALID_REQ;

	DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
static int blktap_release(struct inode *inode, struct file *filp)
{
	tap_blkif_t *info = filp->private_data;

	/* check for control device */

	DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);

	/* Free the ring page. */
	ClearPageReserved(virt_to_page(info->ufe_ring.sring));
	free_page((unsigned long) info->ufe_ring.sring);

	/* Clear any active mappings and free foreign map table */
		zap_page_range(
			info->vma, info->vma->vm_start,
			info->vma->vm_end - info->vma->vm_start, NULL);

		kfree(info->vma->vm_private_data);

	kfree(info->idx_map);
	info->idx_map = NULL;

	if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
		if (info->blkif->xenblkd != NULL) {
			kthread_stop(info->blkif->xenblkd);
			info->blkif->xenblkd = NULL;
		}
		info->status = CLEANSHUTDOWN;
/*
 * We need to map pages to user space in a way that will allow the block
 * subsystem to set up direct IO to them. This couldn't be done before, because
 * there isn't really a sane way to translate a user virtual address down to a
 * physical address when the page belongs to another domain.
 *
 * My first approach was to map the page in to kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space. This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other VMs. vma->vm_private_data is set up as a mapping
 * from pages to actual page structs. There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
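
/*
 * Layout sketch of that private map (a summary derived from the code below,
 * not original commentary): the array holds one struct page * slot per page
 * of the VMA, indexed by ((addr - vma->vm_start) >> PAGE_SHIFT).  The first
 * RING_PAGES slots cover the shared ring and stay NULL; the slots from
 * RING_PAGES onwards are filled in by dispatch_rw_block_io() as foreign
 * frames are granted in, and cleared again in blktap_clear_pte() and
 * blktap_read_ufe_ring().
 */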
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
{
	tap_blkif_t *info = filp->private_data;
	int size, i, ret;
	struct page **map;

		WPRINTK("blktap: mmap, retrieving idx failed\n");

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &blktap_vm_ops;

	size = vma->vm_end - vma->vm_start;
	if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
		WPRINTK("you _must_ map exactly %d pages!\n",
			mmap_pages + RING_PAGES);

	info->rings_vstart = vma->vm_start;
	info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);

	/* Map the ring pages to the start of the region and reserve it. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		ret = vm_insert_page(vma, vma->vm_start,
				     virt_to_page(info->ufe_ring.sring));
	else
		ret = remap_pfn_range(vma, vma->vm_start,
				      __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
				      PAGE_SIZE, vma->vm_page_prot);
		WPRINTK("Mapping user ring failed!\n");

	/* Mark this VM as containing foreign pages, and set up mappings. */
	map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
		      * sizeof(struct page *),
		      GFP_KERNEL);
		WPRINTK("Couldn't alloc VM_FOREIGN map.\n");

	for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
		map[i] = NULL;

	vma->vm_private_data = map;
	vma->vm_flags |= VM_FOREIGN;
	vma->vm_flags |= VM_DONTCOPY;

	vma->vm_mm->context.has_foreign_mappings = 1;

	/* Clear any active mappings. */
	zap_page_range(vma, vma->vm_start,
		       vma->vm_end - vma->vm_start, NULL);
static int blktap_ioctl(struct inode *inode, struct file *filp,
			unsigned int cmd, unsigned long arg)
{
	tap_blkif_t *info = filp->private_data;

	switch (cmd) {
	case BLKTAP_IOCTL_KICK_FE:
		/* There are fe messages to process. */
		return blktap_read_ufe_ring(info);

	case BLKTAP_IOCTL_SETMODE:
		if (BLKTAP_MODE_VALID(arg)) {
			/* XXX: may need to flush rings here. */
			DPRINTK("blktap: set mode to %lx\n",
				arg);

	case BLKTAP_IOCTL_PRINT_IDXS:
		printk("User Rings: \n-----------\n");
		printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
		       "| req_prod: %2d, rsp_prod: %2d\n",
		       info->ufe_ring.rsp_cons,
		       info->ufe_ring.req_prod_pvt,
		       info->ufe_ring.sring->req_prod,
		       info->ufe_ring.sring->rsp_prod);

	case BLKTAP_IOCTL_SENDPID:
		info->pid = (pid_t)arg;
		DPRINTK("blktap: pid received %d\n",
			info->pid);

	case BLKTAP_IOCTL_NEWINTF:
	{
		uint64_t val = (uint64_t)arg;
		domid_translate_t *tr = (domid_translate_t *)&val;

		DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
			tr->domid, tr->busid);
		info = get_next_free_dev();
			WPRINTK("Error initialising /dev/xen/blktap - "
				"No more devices\n");
		info->trans.domid = tr->domid;
		info->trans.busid = tr->busid;

	case BLKTAP_IOCTL_FREEINTF:
	{
		unsigned long dev = arg;

		if ((dev >= MAX_TAP_DEV) || !info)
			return 0; /* should this be an error? */

		spin_lock_irqsave(&pending_free_lock, flags);
		if (info->dev_pending)
			info->dev_pending = 0;
		spin_unlock_irqrestore(&pending_free_lock, flags);

	case BLKTAP_IOCTL_MINOR:
	{
		unsigned long dev = arg;

		if ((dev >= MAX_TAP_DEV) || !info)

	case BLKTAP_IOCTL_MAJOR:

	case BLKTAP_QUERY_ALLOC_REQS:
		WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
			alloc_pending_reqs, blkif_reqs);
		return (alloc_pending_reqs * 100) / blkif_reqs;
static unsigned int blktap_poll(struct file *filp, poll_table *wait)
{
	tap_blkif_t *info = filp->private_data;

	/* do not work on the control device */

	poll_wait(filp, &info->wait, wait);
	if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
		RING_PUSH_REQUESTS(&info->ufe_ring);
		return POLLIN | POLLRDNORM;
	}
	return 0;
}

void blktap_kick_user(int idx)
{
	tap_blkif_t *info;

	info = tapfds[idx];
	if ((idx < 0) || (idx >= MAX_TAP_DEV) || !info)
		return;

	wake_up_interruptible(&info->wait);
}
static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
				 blkif_request_t *req,
				 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, u64 id,
			  unsigned short op, int st);
/******************************************************************
 * REQUEST POOL MANAGEMENT
 */

static int req_increase(void)
{
	int i, j;

	if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)

	pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
					    * blkif_reqs, GFP_KERNEL);
	foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);

	if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])

	DPRINTK("%s: reqs=%d, pages=%d\n",
		__FUNCTION__, blkif_reqs, mmap_pages);

	for (i = 0; i < MAX_PENDING_REQS; i++) {
		list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
			      &pending_free);
		pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
		for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
			BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
								 i, j));
	}

	DPRINTK("# MMAPs increased to %d\n",mmap_alloc);

	free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
	kfree(pending_reqs[mmap_alloc]);
	WPRINTK("%s: out of memory\n", __FUNCTION__);
static void mmap_req_del(int mmap)
{
	BUG_ON(!spin_is_locked(&pending_free_lock));

	kfree(pending_reqs[mmap]);
	pending_reqs[mmap] = NULL;

	free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
	foreign_pages[mmap] = NULL;

	DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
static pending_req_t* alloc_req(void)
{
	pending_req_t *req = NULL;
	unsigned long flags;

	spin_lock_irqsave(&pending_free_lock, flags);

	if (!list_empty(&pending_free)) {
		req = list_entry(pending_free.next, pending_req_t, free_list);
		list_del(&req->free_list);

	alloc_pending_reqs++;

	spin_unlock_irqrestore(&pending_free_lock, flags);

static void free_req(pending_req_t *req)
{
	unsigned long flags;
	int was_empty = 0;

	spin_lock_irqsave(&pending_free_lock, flags);

	alloc_pending_reqs--;

	if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
		if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
		spin_unlock_irqrestore(&pending_free_lock, flags);

	was_empty = list_empty(&pending_free);
	list_add(&req->free_list, &pending_free);

	spin_unlock_irqrestore(&pending_free_lock, flags);

	if (was_empty)
		wake_up(&pending_free_wq);
static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
			    int tapidx)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
	unsigned int i, invcount = 0;
	struct grant_handle_pair *khandle;
	uint64_t ptep;
	int ret, mmap_idx;
	unsigned long kvaddr, uvaddr;
	tap_blkif_t *info;

	info = tapfds[tapidx];

	if ((tapidx < 0) || (tapidx >= MAX_TAP_DEV) || !info) {
		WPRINTK("fast_flush: Couldn't get info!\n");

	if (info->vma != NULL &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		down_write(&info->vma->vm_mm->mmap_sem);
		zap_page_range(info->vma,
			       MMAP_VADDR(info->user_vstart, u_idx, 0),
			       req->nr_pages << PAGE_SHIFT, NULL);
		up_write(&info->vma->vm_mm->mmap_sem);

	mmap_idx = req->mem_idx;

	for (i = 0; i < req->nr_pages; i++) {
		kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
		uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);

		khandle = &pending_handle(mmap_idx, k_idx, i);

		if (khandle->kernel != INVALID_GRANT_HANDLE) {
			gnttab_set_unmap_op(&unmap[invcount],
					    idx_to_kaddr(mmap_idx, k_idx, i),
					    GNTMAP_host_map, khandle->kernel);
			set_phys_to_machine(
				__pa(idx_to_kaddr(mmap_idx, k_idx, i))
				>> PAGE_SHIFT, INVALID_P2M_ENTRY);

		if (khandle->user != INVALID_GRANT_HANDLE) {
			BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
			if (create_lookup_pte_addr(
				MMAP_VADDR(info->user_vstart, u_idx, i),
				WPRINTK("Couldn't get a pte addr!\n");

			gnttab_set_unmap_op(&unmap[invcount], ptep,
					    GNTMAP_host_map
					    | GNTMAP_application_map
					    | GNTMAP_contains_pte,

		BLKTAP_INVALIDATE_HANDLE(khandle);

	ret = HYPERVISOR_grant_table_op(
		GNTTABOP_unmap_grant_ref, unmap, invcount);

	if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
		zap_page_range(info->vma,
			       MMAP_VADDR(info->user_vstart, u_idx, 0),
			       req->nr_pages << PAGE_SHIFT, NULL);
/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
	printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
	       current->comm, blkif->st_oo_req,
	       blkif->st_rd_req, blkif->st_wr_req);
	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
	blkif->st_rd_req = 0;
	blkif->st_wr_req = 0;
	blkif->st_oo_req = 0;
}
int tap_blkif_schedule(void *arg)
{
	blkif_t *blkif = arg;

	printk(KERN_DEBUG "%s: started\n", current->comm);

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_interruptible(
			blkif->wq,
			blkif->waiting_reqs || kthread_should_stop());
		wait_event_interruptible(
			pending_free_wq,
			!list_empty(&pending_free) || kthread_should_stop());

		blkif->waiting_reqs = 0;
		smp_mb(); /* clear flag *before* checking for work */

		if (do_block_io_op(blkif))
			blkif->waiting_reqs = 1;

		if (log_stats && time_after(jiffies, blkif->st_print))
			print_stats(blkif);
	}

	printk(KERN_DEBUG "%s: exiting\n", current->comm);

	blkif->xenblkd = NULL;
/******************************************************************
 * COMPLETION CALLBACK -- Called by user level ioctl()
 */

static int blktap_read_ufe_ring(tap_blkif_t *info)
{
	/* This is called to read responses from the UFE ring. */
	RING_IDX i, j, rp;
	blkif_response_t *resp;
	blkif_t *blkif=NULL;
	int pending_idx, usr_idx, mmap_idx;
	pending_req_t *pending_req;

	/* We currently only forward packets in INTERCEPT_FE mode. */
	if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
		return 0;

	/* for each outstanding message on the UFE ring */
	rp = info->ufe_ring.sring->rsp_prod;
	rmb();

	for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
		blkif_response_t res;
		resp = RING_GET_RESPONSE(&info->ufe_ring, i);
		memcpy(&res, resp, sizeof(res));
		mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
		++info->ufe_ring.rsp_cons;

		/* retrieve [usr_idx] to [mmap_idx,pending_idx] mapping */
		usr_idx = (int)res.id;
		pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
		mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);

		if ( (mmap_idx >= mmap_alloc) ||
		     (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
			WPRINTK("Incorrect req map"
				"[%d], internal map [%d,%d (%d)]\n",
				usr_idx, mmap_idx,
				ID_TO_IDX(info->idx_map[usr_idx]),
				MASK_PEND_IDX(
					ID_TO_IDX(info->idx_map[usr_idx])));

		pending_req = &pending_reqs[mmap_idx][pending_idx];
		blkif = pending_req->blkif;

		for (j = 0; j < pending_req->nr_pages; j++) {

			unsigned long kvaddr, uvaddr;
			struct page **map = info->vma->vm_private_data;
			struct page *pg;
			int offset;

			uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
			kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);

			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
			ClearPageReserved(pg);
			offset = (uvaddr - info->vma->vm_start)
				>> PAGE_SHIFT;
			map[offset] = NULL;
		}
		fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
		info->idx_map[usr_idx] = INVALID_REQ;
		make_response(blkif, pending_req->id, res.operation,
			      res.status);
		blkif_put(pending_req->blkif);
		free_req(pending_req);
	}

	return 0;
}
/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

static void blkif_notify_work(blkif_t *blkif)
{
	blkif->waiting_reqs = 1;
	wake_up(&blkif->wq);
}

irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
{
	blkif_notify_work(dev_id);
	return IRQ_HANDLED;
}
/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int print_dbug = 1;
static int do_block_io_op(blkif_t *blkif)
{
	blkif_back_rings_t *blk_rings = &blkif->blk_rings;
	blkif_request_t req;
	pending_req_t *pending_req;
	RING_IDX rc, rp;
	tap_blkif_t *info;

	rc = blk_rings->common.req_cons;
	rp = blk_rings->common.sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	/* Check blkif has a corresponding UE ring */
	if (blkif->dev_num < 0) {
			WPRINTK("Corresponding UE "
				"ring does not exist!\n");
			print_dbug = 0; /* We only print this message once */

	info = tapfds[blkif->dev_num];

	if (blkif->dev_num >= MAX_TAP_DEV || !info || !info->dev_inuse) {
			WPRINTK("Can't get UE info!\n");

		if (RING_FULL(&info->ufe_ring)) {
			WPRINTK("RING_FULL! More to do\n");

		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
			WPRINTK("RING_REQUEST_CONS_OVERFLOW!"

		pending_req = alloc_req();
		if (NULL == pending_req) {

			if (kthread_should_stop()) {

		switch (blkif->blk_protocol) {
		case BLKIF_PROTOCOL_NATIVE:
			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
			       sizeof(req));
			break;
		case BLKIF_PROTOCOL_X86_32:
			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
			break;
		case BLKIF_PROTOCOL_X86_64:
			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
			break;
		default:
			BUG();
		}
		blk_rings->common.req_cons = ++rc; /* before make_response() */

		switch (req.operation) {
		case BLKIF_OP_READ:
			dispatch_rw_block_io(blkif, &req, pending_req);
			break;

		case BLKIF_OP_WRITE:
			dispatch_rw_block_io(blkif, &req, pending_req);
			break;

		default:
			/* A good sign something is wrong: sleep for a while to
			 * avoid excessive CPU consumption by a bad guest. */
			WPRINTK("unknown operation [%d]\n",
				req.operation);
			make_response(blkif, req.id, req.operation,
				      BLKIF_RSP_ERROR);
			free_req(pending_req);
			break;
		}

		/* Yield point for this unbounded loop. */

	blktap_kick_user(blkif->dev_num);
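
/*
 * Note on the mapping scheme below (a summary of the code, not original
 * commentary): for every segment of a request, dispatch_rw_block_io()
 * grant-maps the frontend's frame twice -- once into the kernel at
 * idx_to_kaddr(mmap_idx, pending_idx, seg) and, unless the domain is
 * auto-translated, once more into the tapdisk process at
 * MMAP_VADDR(user_vstart, usr_idx, seg) via a GNTMAP_contains_pte map on
 * the user PTE.  That is why map[] below holds up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST*2 entries, and why the even/odd pairs are
 * split apart again once the hypercall returns.
 */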
static void dispatch_rw_block_io(blkif_t *blkif,
				 blkif_request_t *req,
				 pending_req_t *pending_req)
{
	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
	int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
	unsigned int nseg;
	int ret, i, nr_sects = 0;
	tap_blkif_t *info;
	blkif_request_t *target;
	int usr_idx;
	int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
	uint16_t mmap_idx = pending_req->mem_idx;

	if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV)

	info = tapfds[blkif->dev_num];

	/* Check we have space on user ring - should never fail. */
	usr_idx = GET_NEXT_REQ(info->idx_map);
	if (usr_idx == INVALID_REQ) {

	/* Check that number of segments is sane. */
	nseg = req->nr_segments;
	if ( unlikely(nseg == 0) ||
	     unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
		WPRINTK("Bad number of segments in request (%d)\n", nseg);

	/* Make sure userspace is ready. */
	if (!info->ring_ok) {
		WPRINTK("blktap: ring not ready for requests!\n");

	if (RING_FULL(&info->ufe_ring)) {
		WPRINTK("blktap: fe_ring is full, can't add "
			"IO Request will be dropped. %d %d\n",
			RING_SIZE(&info->ufe_ring),
			RING_SIZE(&blkif->blk_rings.common));

	pending_req->blkif     = blkif;
	pending_req->id        = req->id;
	pending_req->operation = operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;
	for (i = 0; i < nseg; i++) {
		unsigned long uvaddr;
		unsigned long kvaddr;
		uint64_t ptep;
		uint32_t flags;

		uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
		kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);

		flags = GNTMAP_host_map;
		if (operation == WRITE)
			flags |= GNTMAP_readonly;
		gnttab_set_map_op(&map[op], kvaddr, flags,
				  req->seg[i].gref, blkif->domid);

		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
			/* Now map it to user. */
			ret = create_lookup_pte_addr(info->vma->vm_mm,
						     uvaddr, &ptep);
				WPRINTK("Couldn't get a pte addr!\n");

			flags = GNTMAP_host_map | GNTMAP_application_map
				| GNTMAP_contains_pte;
			if (operation == WRITE)
				flags |= GNTMAP_readonly;
			gnttab_set_map_op(&map[op], ptep, flags,
					  req->seg[i].gref, blkif->domid);
		}

		nr_sects += (req->seg[i].last_sect -
			     req->seg[i].first_sect + 1);
	}

	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		for (i = 0; i < (nseg*2); i+=2) {
			unsigned long uvaddr;
			unsigned long kvaddr;
			unsigned long offset;
			struct page *pg;

			uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
			kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);

			if (unlikely(map[i].status != 0)) {
				WPRINTK("invalid kernel buffer -- "
					"could not remap it\n");
				map[i].handle = INVALID_GRANT_HANDLE;
			}

			if (unlikely(map[i+1].status != 0)) {
				WPRINTK("invalid user buffer -- "
					"could not remap it\n");
				map[i+1].handle = INVALID_GRANT_HANDLE;
			}

			pending_handle(mmap_idx, pending_idx, i/2).kernel
				= map[i].handle;
			pending_handle(mmap_idx, pending_idx, i/2).user
				= map[i+1].handle;

			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
					    FOREIGN_FRAME(map[i].dev_bus_addr
							  >> PAGE_SHIFT));
			offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
			((struct page **)info->vma->vm_private_data)[offset] =
				pg;
		}
	} else {
		for (i = 0; i < nseg; i++) {
			unsigned long uvaddr;
			unsigned long kvaddr;
			unsigned long offset;
			struct page *pg;

			uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
			kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);

			if (unlikely(map[i].status != 0)) {
				WPRINTK("invalid kernel buffer -- "
					"could not remap it\n");
				map[i].handle = INVALID_GRANT_HANDLE;
			}

			pending_handle(mmap_idx, pending_idx, i).kernel
				= map[i].handle;

			offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
			((struct page **)info->vma->vm_private_data)[offset] =
				pg;
		}
	}
	if (xen_feature(XENFEAT_auto_translated_physmap))
		down_write(&info->vma->vm_mm->mmap_sem);
	/* Mark mapped pages as reserved: */
	for (i = 0; i < req->nr_segments; i++) {
		unsigned long kvaddr;
		struct page *pg;

		kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
		pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
		SetPageReserved(pg);
		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			ret = vm_insert_page(info->vma,
					     MMAP_VADDR(info->user_vstart,
							usr_idx, i), pg);
			if (ret) {
				up_write(&info->vma->vm_mm->mmap_sem);
				goto fail_flush;
			}
		}
	}
	if (xen_feature(XENFEAT_auto_translated_physmap))
		up_write(&info->vma->vm_mm->mmap_sem);

	/* record [mmap_idx,pending_idx] to [usr_idx] mapping */
	info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);

	/* Finally, write the request message to the user ring. */
	target = RING_GET_REQUEST(&info->ufe_ring,
				  info->ufe_ring.req_prod_pvt);
	memcpy(target, req, sizeof(*req));
	target->id = usr_idx;
	wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
	info->ufe_ring.req_prod_pvt++;

	if (operation == READ)
		blkif->st_rd_sect += nr_sects;
	else if (operation == WRITE)
		blkif->st_wr_sect += nr_sects;

	return;

 fail_flush:
	WPRINTK("Reached Fail_flush\n");
	fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
 fail_response:
	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
	free_req(pending_req);
	msleep(1); /* back off a bit */
}
/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

static void make_response(blkif_t *blkif, u64 id,
			  unsigned short op, int st)
{
	blkif_response_t    resp;
	unsigned long       flags;
	blkif_back_rings_t *blk_rings = &blkif->blk_rings;
	int more_to_do = 0;
	int notify;

	resp.id        = id;
	resp.operation = op;
	resp.status    = st;

	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	/* Place on the response ring for the relevant domain. */
	switch (blkif->blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		memcpy(RING_GET_RESPONSE(&blk_rings->native,
					 blk_rings->native.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_32:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
					 blk_rings->x86_32.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_64:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
					 blk_rings->x86_64.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	default:
		BUG();
	}
	blk_rings->common.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);

	if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
		/*
		 * Tail check for pending requests. Allows frontend to avoid
		 * notifications if requests are already in flight (lower
		 * overheads and promotes batching).
		 */
		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
	} else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
		more_to_do = 1;
	}

	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
	if (more_to_do)
		blkif_notify_work(blkif);
	if (notify)
		notify_remote_via_irq(blkif->irq);
}
static int __init blkif_init(void)
{
	int i, ret;
	struct class *class;

	if (!is_running_on_xen())
		return -ENODEV;

	INIT_LIST_HEAD(&pending_free);
	for (i = 0; i < 2; i++) {
		ret = req_increase();
		if (ret)
			break;
	}

	tap_blkif_interface_init();

	alloc_pending_reqs = 0;

	tap_blkif_xenbus_init();

	/* Dynamically allocate a major for this device */
	ret = register_chrdev(0, "blktap", &blktap_fops);
	if (ret < 0) {
		WPRINTK("Couldn't register /dev/xen/blktap\n");
		return -ENOMEM;
	}

	blktap_major = ret;

	/* tapfds[0] is always NULL */
	blktap_next_minor++;

	DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);

	/* Make sure the xen class exists */
	if ((class = get_xen_class()) != NULL) {
		/*
		 * This will allow udev to create the blktap ctrl device.
		 * We only want to create blktap0 first.  We don't want
		 * to flood the sysfs system with needless blktap devices.
		 * We only create the device when a request of a new device is
		 * made in userspace.
		 */
		class_device_create(class, NULL,
				    MKDEV(blktap_major, 0), NULL,
				    "blktap0");
	} else
		/* this is bad, but not fatal */
		WPRINTK("blktap: sysfs xen_class not created\n");

	DPRINTK("Blktap device successfully created\n");

	return 0;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");