#include <linux/module.h>
#include <linux/signal.h>
static int blktap_ring_major;
static inline struct blktap *
vma_to_blktap(struct vm_area_struct *vma)
{
	struct vm_foreign_map *m = vma->vm_private_data;
	struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
	return container_of(r, struct blktap, ring);
}
/*
 * BLKTAP - immediately before the mmap area,
 * we have a bunch of pages reserved for shared memory rings.
 */
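/*
 * Rough sketch of the layout established by blktap_ring_mmap() below
 * (ring_vstart == vma->vm_start, user_vstart == ring_vstart +
 * (RING_PAGES << PAGE_SHIFT)):
 *
 *	ring_vstart                     user_vstart
 *	| RING_PAGES shared ring page(s) | MMAP_PAGES data pages |
 *
 * Within the data area, page index
 * usr_idx * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg corresponds to
 * segment seg of pending request usr_idx.
 */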
static int
blktap_read_ring(struct blktap *tap)
{
	/* This is called to read responses from the ring. */
	int usr_idx;
	RING_IDX rc, rp;
	blkif_response_t res;
	struct blktap_ring *ring;
	struct blktap_request *request;

	down_read(&tap->tap_sem);

	ring = &tap->ring;
	if (!ring->vma) {
		up_read(&tap->tap_sem);
		return 0;
	}

	/* for each outstanding message on the ring */
	rp = ring->ring.sring->rsp_prod;

	for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
		memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
		mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
		++ring->ring.rsp_cons;

		usr_idx = (int)res.id;
		if (usr_idx >= MAX_PENDING_REQS ||
		    !tap->pending_requests[usr_idx]) {
			BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
			       rc, rp, usr_idx, tap->pid, ring->vma);
			continue;
		}

		request = tap->pending_requests[usr_idx];
		BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
		blktap_device_finish_request(tap, &res, request);
	}

	up_read(&tap->tap_sem);

	blktap_run_deferred();

	return 0;
}
static int
blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/*
	 * If the page has not been mapped in by the driver then return
	 * VM_FAULT_SIGBUS to the domain.
	 */
	return VM_FAULT_SIGBUS;
}
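/*
 * Each data page mapped into the tapdisk VMA is backed by a pair of
 * grant handles (request->handles[seg]): the "kernel" handle covers the
 * kernel-side mapping at request_to_kaddr(), and the "user" handle
 * covers the PTE in the user VMA (GNTMAP_contains_pte).
 * blktap_ring_clear_pte() below tears both down when a user PTE is
 * zapped; addresses below user_vstart belong to the shared ring and are
 * simply cleared.
 */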
static pte_t
blktap_ring_clear_pte(struct vm_area_struct *vma,
		      unsigned long uvaddr,
		      pte_t *ptep, int is_fullmm)
{
	pte_t copy;
	struct blktap *tap;
	unsigned long kvaddr;
	struct page **map, *page;
	struct blktap_ring *ring;
	struct blktap_request *request;
	struct grant_handle_pair *khandle;
	struct gnttab_unmap_grant_ref unmap[2];
	int offset, seg, usr_idx, count = 0;

	tap = vma_to_blktap(vma);
	ring = &tap->ring;
	map = ring->foreign_map.map;
	BUG_ON(!map);	/* TODO Should this be changed to an if statement? */

	/*
	 * Zap entry if the address is before the start of the grant
	 * mapped region.
	 */
	if (uvaddr < ring->user_vstart)
		return xen_ptep_get_and_clear_full(vma, uvaddr,
						   ptep, is_fullmm);

	offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
	usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
	seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;

	offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
	page = map[offset];
	if (page && PageBlkback(page)) {
		ClearPageBlkback(page);
		set_page_private(page, 0);
	}

	request = tap->pending_requests[usr_idx];
	kvaddr = request_to_kaddr(request, seg);
	khandle = request->handles + seg;

	if (khandle->kernel != INVALID_GRANT_HANDLE) {
		gnttab_set_unmap_op(&unmap[count], kvaddr,
				    GNTMAP_host_map, khandle->kernel);
		count++;
		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
				    INVALID_P2M_ENTRY);
	}

	if (khandle->user != INVALID_GRANT_HANDLE) {
		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
		copy = *ptep;
		gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep),
				    GNTMAP_host_map
				    | GNTMAP_application_map
				    | GNTMAP_contains_pte,
				    khandle->user);
		count++;
	} else
		copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
						   is_fullmm);

	if (count)
		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					      unmap, count))
			BUG();

	khandle->kernel = INVALID_GRANT_HANDLE;
	khandle->user = INVALID_GRANT_HANDLE;

	return copy;
}
static void
blktap_ring_vm_unmap(struct vm_area_struct *vma)
{
	struct blktap *tap = vma_to_blktap(vma);

	down_write(&tap->tap_sem);
	clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
	clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
	clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
	up_write(&tap->tap_sem);
}
static void
blktap_ring_vm_close(struct vm_area_struct *vma)
{
	struct blktap *tap = vma_to_blktap(vma);
	struct blktap_ring *ring = &tap->ring;

	blktap_ring_vm_unmap(vma);                 /* fail future requests */
	blktap_device_fail_pending_requests(tap);  /* fail pending requests */
	blktap_device_restart(tap);                /* fail deferred requests */

	down_write(&tap->tap_sem);

	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);

	kfree(ring->foreign_map.map);
	ring->foreign_map.map = NULL;

	/* Free the ring page. */
	ClearPageReserved(virt_to_page(ring->ring.sring));
	free_page((unsigned long)ring->ring.sring);

	BTINFO("unmapping ring %d\n", tap->minor);
	ring->ring.sring = NULL;

	up_write(&tap->tap_sem);
}
static struct vm_operations_struct blktap_ring_vm_operations = {
	.close   = blktap_ring_vm_close,
	.unmap   = blktap_ring_vm_unmap,
	.fault   = blktap_ring_fault,
	.zap_pte = blktap_ring_clear_pte,
};
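/*
 * Note: .unmap and .zap_pte are not part of the mainline
 * vm_operations_struct; they are hooks carried in the Xen patch set,
 * invoked when the VMA is being unmapped and when individual PTEs in
 * it are cleared, respectively.
 */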
static int
blktap_ring_open(struct inode *inode, struct file *filp)
{
	int idx = iminor(inode);
	struct blktap *tap;

	if (idx < 0 || idx >= CONFIG_XEN_NR_TAP2_DEVICES || !blktaps[idx]) {
		BTERR("unable to open device blktap%d\n", idx);
		return -ENODEV;
	}
	tap = blktaps[idx];

	BTINFO("opening device blktap%d\n", idx);
	if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
		return -ENODEV;

	/* Only one process can access the ring at a time. */
	if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
		return -EBUSY;

	filp->private_data = tap;
	BTINFO("opened device %d\n", tap->minor);

	return 0;
}
static int
blktap_ring_release(struct inode *inode, struct file *filp)
{
	struct blktap *tap = filp->private_data;

	BTINFO("freeing device %d\n", tap->minor);
	clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
	filp->private_data = NULL;

	return 0;
}
/*
 * We need to map pages to user space in a way that will allow the block
 * subsystem to set up direct IO to them. This couldn't be done before,
 * because there isn't really a sane way to translate a user virtual address
 * down to a physical address when the page belongs to another domain.
 *
 * My first approach was to map the page into kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space. This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other VMs. vma->vm_private_data is set up as a mapping
 * from pages to actual page structs. There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
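/*
 * Illustrative only: with VM_FOREIGN set, resolving a user address in
 * this VMA goes through the map installed below rather than through the
 * page tables, roughly
 *
 *	struct vm_foreign_map *fmap = vma->vm_private_data;
 *	struct page *pg = fmap->map[(addr - vma->vm_start) >> PAGE_SHIFT];
 *
 * (fmap, pg and addr are just placeholder names here); this is what the
 * VM_FOREIGN clause in get_user_pages relies on.
 */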
static int
blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
{
	int size, err;
	struct page **map = NULL;
	struct blktap *tap;
	blkif_sring_t *sring;
	struct blktap_ring *ring;

	tap = filp->private_data;

	if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
		return -ENOMEM;

	ring = &tap->ring;

	size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	if (size != (MMAP_PAGES + RING_PAGES)) {
		BTERR("you _must_ map exactly %lu pages!\n",
		      MMAP_PAGES + RING_PAGES);
		return -EAGAIN;
	}

	/* Allocate the fe ring. */
	sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
	if (!sring) {
		BTERR("Couldn't alloc sring.\n");
		goto fail_mem;
	}

	map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
	if (!map) {
		BTERR("Couldn't alloc VM_FOREIGN map.\n");
		goto fail_mem;
	}

	SetPageReserved(virt_to_page(sring));

	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);

	ring->ring_vstart = vma->vm_start;
	ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);

	/* Map the ring pages to the start of the region and reserve it. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		err = vm_insert_page(vma, vma->vm_start,
				     virt_to_page(ring->ring.sring));
	else
		err = remap_pfn_range(vma, vma->vm_start,
				      __pa(ring->ring.sring) >> PAGE_SHIFT,
				      PAGE_SIZE, vma->vm_page_prot);
	if (err) {
		BTERR("Mapping user ring failed: %d\n", err);
		goto fail;
	}

	/* Mark this VM as containing foreign pages, and set up mappings. */
	ring->foreign_map.map = map;
	vma->vm_private_data = &ring->foreign_map;
	vma->vm_flags |= VM_FOREIGN;
	vma->vm_flags |= VM_DONTCOPY;
	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &blktap_ring_vm_operations;

	vma->vm_mm->context.has_foreign_mappings = 1;

	tap->pid = current->pid;
	BTINFO("blktap: mapping pid is %d\n", tap->pid);

	ring->vma = vma;
	return 0;

 fail:
	/* Clear any active mappings. */
	zap_page_range(vma, vma->vm_start,
		       vma->vm_end - vma->vm_start, NULL);
	ClearPageReserved(virt_to_page(sring));
 fail_mem:
	free_page((unsigned long)sring);
	kfree(map);

	return -ENOMEM;
}
static void
blktap_ring_set_message(struct blktap *tap, int msg)
{
	struct blktap_ring *ring = &tap->ring;

	down_read(&tap->tap_sem);
	if (ring->ring.sring)
		ring->ring.sring->private.tapif_user.msg = msg;
	up_read(&tap->tap_sem);
}
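/*
 * ioctl interface used by tapdisk: BLKTAP2_IOCTL_KICK_FE signals that
 * responses have been queued on the shared ring and triggers
 * blktap_read_ring(); BLKTAP2_IOCTL_CREATE_DEVICE instantiates the
 * block device from the supplied blktap_params; the remaining commands
 * (SET_PARAMS, PAUSE, REOPEN, RESUME) are tapdisk's side of the
 * pause/resume handshake and finish by clearing the ring message and
 * waking tap->wq.
 */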
static long
blktap_ring_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct blktap_params params;
	struct blktap *tap = filp->private_data;

	BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);

	switch (cmd) {
	case BLKTAP2_IOCTL_KICK_FE:
		/* There are fe messages to process. */
		return blktap_read_ring(tap);

	case BLKTAP2_IOCTL_CREATE_DEVICE:
		if (copy_from_user(&params, (struct blktap_params __user *)arg,
				   sizeof(params))) {
			BTERR("failed to get params\n");
			return -EFAULT;
		}
		if (blktap_validate_params(tap, &params)) {
			BTERR("invalid params\n");
			return -EINVAL;
		}
		tap->params = params;
		return blktap_device_create(tap);

	case BLKTAP2_IOCTL_SET_PARAMS:
		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
			return -EINVAL;
		if (copy_from_user(&params, (struct blktap_params __user *)arg,
				   sizeof(params))) {
			BTERR("failed to get params\n");
			return -EFAULT;
		}
		if (blktap_validate_params(tap, &params)) {
			BTERR("invalid params\n");
			return -EINVAL;
		}
		tap->params = params;
		return 0;

	case BLKTAP2_IOCTL_PAUSE:
		if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
			return -EINVAL;
		set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
		clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);

		blktap_ring_set_message(tap, 0);
		wake_up_interruptible(&tap->wq);
		return 0;

	case BLKTAP2_IOCTL_REOPEN:
		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
			return -EINVAL;
		if (copy_to_user((char __user *)arg,
				 tap->params.name,
				 strlen(tap->params.name) + 1))
			return -EFAULT;

		blktap_ring_set_message(tap, 0);
		wake_up_interruptible(&tap->wq);
		return 0;

	case BLKTAP2_IOCTL_RESUME:
		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
			return -EINVAL;
		tap->ring.response = (int)arg;
		if (!tap->ring.response)
			clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);

		blktap_ring_set_message(tap, 0);
		wake_up_interruptible(&tap->wq);
		return 0;
	}

	return -ENOIOCTLCMD;
}
static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
{
	struct blktap *tap = filp->private_data;
	struct blktap_ring *ring = &tap->ring;

	poll_wait(filp, &ring->poll_wait, wait);
	if (ring->ring.sring->private.tapif_user.msg ||
	    ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
		RING_PUSH_REQUESTS(&ring->ring);
		return POLLIN | POLLRDNORM;
	}

	return 0;
}
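/*
 * Note that poll() doubles as the submission path: any requests queued
 * in the private ring are published to tapdisk via RING_PUSH_REQUESTS()
 * before POLLIN is signalled.
 */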
static const struct file_operations blktap_ring_file_operations = {
	.owner          = THIS_MODULE,
	.open           = blktap_ring_open,
	.release        = blktap_ring_release,
	.unlocked_ioctl = blktap_ring_ioctl,
	.mmap           = blktap_ring_mmap,
	.poll           = blktap_ring_poll,
};
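/*
 * Expected user-space usage, roughly: tapdisk opens the per-minor
 * character device, mmaps exactly RING_PAGES + MMAP_PAGES pages, then
 * loops on poll() to pick up requests and issues BLKTAP2_IOCTL_KICK_FE
 * after writing responses back to the ring.
 */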
void
blktap_ring_kick_user(struct blktap *tap)
{
	wake_up_interruptible(&tap->ring.poll_wait);
}
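/*
 * Kernel side of the pause/resume handshake: a request is posted in the
 * shared ring (sring->private.tapif_user.msg) and tapdisk is woken via
 * the poll wait queue; tapdisk acknowledges with the PAUSE/RESUME
 * ioctls above, which update BLKTAP_PAUSED in dev_inuse and wake
 * tap->wq, on which the functions below sleep.
 */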
int
blktap_ring_resume(struct blktap *tap)
{
	int err;
	struct blktap_ring *ring = &tap->ring;

	if (!blktap_active(tap))
		return -ENODEV;
	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return -EINVAL;

	/* set shared flag for resume */
	ring->response = 0;

	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
	blktap_ring_kick_user(tap);

	wait_event_interruptible(tap->wq, ring->response ||
				 !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));

	err = ring->response;
	ring->response = 0;

	BTDBG("err: %d\n", err);
	if (err)
		return err;

	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return -EAGAIN;

	return 0;
}
int
blktap_ring_pause(struct blktap *tap)
{
	if (!blktap_active(tap))
		return -ENODEV;
	if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
		return -EINVAL;

	BTDBG("draining queue\n");
	wait_event_interruptible(tap->wq, !tap->pending_cnt);
	if (tap->pending_cnt)
		return -EAGAIN;

	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
	blktap_ring_kick_user(tap);

	BTDBG("waiting for tapdisk response\n");
	wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return -EAGAIN;

	return 0;
}
int
blktap_ring_destroy(struct blktap *tap)
{
	if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
	    !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
		return 0;

	BTDBG("sending tapdisk close message\n");
	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
	blktap_ring_kick_user(tap);

	return -EAGAIN;
}
static void
blktap_ring_initialize(struct blktap_ring *ring, int minor)
{
	memset(ring, 0, sizeof(*ring));
	init_waitqueue_head(&ring->poll_wait);
	ring->devno = MKDEV(blktap_ring_major, minor);
}
int
blktap_ring_create(struct blktap *tap)
{
	struct blktap_ring *ring = &tap->ring;

	blktap_ring_initialize(ring, tap->minor);
	return blktap_sysfs_create(tap);
}
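/*
 * Passing 0 as the major to __register_chrdev() below asks for a
 * dynamically allocated major number, which is returned on success; it
 * is recorded in blktap_ring_major so that blktap_ring_free() can
 * unregister the same region later.
 */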
int
blktap_ring_init(int *major)
{
	int err;

	err = __register_chrdev(0, 0, CONFIG_XEN_NR_TAP2_DEVICES, "blktap2",
				&blktap_ring_file_operations);
	if (err < 0) {
		BTERR("error registering blktap ring device: %d\n", err);
		return err;
	}

	blktap_ring_major = *major = err;
	BTINFO("blktap ring major: %d\n", blktap_ring_major);

	return 0;
}
void
blktap_ring_free(void)
{
	if (blktap_ring_major)
		__unregister_chrdev(blktap_ring_major, 0,
				    CONFIG_XEN_NR_TAP2_DEVICES, "blktap2");
}