/* drivers/xen/blktap2/device.c */
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
#include <linux/hdreg.h>
#include <linux/module.h>
#include <linux/version.h>
#include <asm/tlbflush.h>

#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>

#include <xen/xenbus.h>
#include <xen/interface/io/blkif.h>

#include "blktap.h"

#include "../blkback/blkback-pagemap.h"

#if 0
#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
#else
#define DPRINTK_IOCTL(_f, _a...) ((void)0)
#endif

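/*
 * Grant map operations for one request: each of the up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST segments may need two map ops, one
 * for the kernel mapping and one for the userspace (tapdisk) mapping,
 * hence the doubled array.
 */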
struct blktap_grant_table {
	int cnt;
	struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
};

static int blktap_device_major;

static inline struct blktap *
dev_to_blktap(struct blktap_device *dev)
{
	return container_of(dev, struct blktap, device);
}

static int
blktap_device_open(struct block_device *bd, fmode_t mode)
{
	struct blktap *tap;
	struct blktap_device *dev = bd->bd_disk->private_data;

	if (!dev)
		return -ENOENT;

	tap = dev_to_blktap(dev);
	if (!blktap_active(tap) ||
	    test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
		return -ENOENT;

	dev->users++;

	return 0;
}

static int
blktap_device_release(struct gendisk *disk, fmode_t mode)
{
	struct blktap_device *dev = disk->private_data;
	struct blktap *tap = dev_to_blktap(dev);

	dev->users--;
	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
		blktap_device_destroy(tap);

	return 0;
}

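/*
 * Fake disk geometry: 0xff heads and 0x3f sectors per track, so
 * cylinders = capacity / (255 * 63), capped at 0xffff when the
 * device is too large to describe exactly.
 */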
static int
blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
	/* We don't have real geometry info, but let's at least return
	   values consistent with the size of the device */
	sector_t nsect = get_capacity(bd->bd_disk);
	sector_t cylinders = nsect;

	hg->heads = 0xff;
	hg->sectors = 0x3f;
	sector_div(cylinders, hg->heads * hg->sectors);
	hg->cylinders = cylinders;
	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
		hg->cylinders = 0xffff;
	return 0;
}

static int
blktap_device_ioctl(struct block_device *bd, fmode_t mode,
		    unsigned command, unsigned long argument)
{
	int i;

	DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx\n",
		      command, (long)argument);

	switch (command) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
	case HDIO_GETGEO: {
		struct hd_geometry geo;
		int ret;

		if (!argument)
			return -EINVAL;

		geo.start = get_start_sect(bd);
		ret = blktap_device_getgeo(bd, &geo);
		if (ret)
			return ret;

		if (copy_to_user((struct hd_geometry __user *)argument, &geo,
				 sizeof(geo)))
			return -EFAULT;

		return 0;
	}
#endif
	case CDROMMULTISESSION:
		BTDBG("FIXME: support multisession CDs later\n");
		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
			if (put_user(0, (char __user *)(argument + i)))
				return -EFAULT;
		return 0;

	case SCSI_IOCTL_GET_IDLUN:
		if (!access_ok(VERIFY_WRITE, argument,
			       sizeof(struct scsi_idlun)))
			return -EFAULT;

		/* return 0 for now. */
		__put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
		__put_user(0,
			   &((struct scsi_idlun __user *)argument)->host_unique_id);
		return 0;

	default:
		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
		  command);*/
		return -EINVAL; /* same return as native Linux */
	}

	return 0;
}

static const struct block_device_operations blktap_device_file_operations = {
	.owner     = THIS_MODULE,
	.open      = blktap_device_open,
	.release   = blktap_device_release,
	.ioctl     = blktap_device_ioctl,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
	.getgeo    = blktap_device_getgeo
#endif
};

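/*
 * PTE install/clear helpers built on apply_to_page_range(); callers
 * pass a NULL vma when operating on kernel virtual addresses.
 */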
static int
blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
		    unsigned long addr, void *data)
{
	pte_t *pte = (pte_t *)data;

	BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte));
	set_pte(ptep, *pte);
	return 0;
}

static int
blktap_map_uaddr(struct vm_area_struct *vma, unsigned long address, pte_t pte)
{
	return apply_to_page_range(vma ? vma->vm_mm : NULL, address,
				   PAGE_SIZE, blktap_map_uaddr_fn, &pte);
}

static int
blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
		     unsigned long addr, void *data)
{
	struct vm_area_struct *vma = data;

	BTDBG("ptep %p\n", ptep);
	xen_ptep_get_and_clear_full(vma, addr, ptep, 1);
	return 0;
}

static int
blktap_umap_uaddr(struct vm_area_struct *vma, unsigned long address)
{
	struct mm_struct *mm = NULL;

	if (!vma) {
#ifdef CONFIG_X86
		if (HYPERVISOR_update_va_mapping(address, __pte(0),
						 UVMF_INVLPG|UVMF_ALL))
			BUG();
		return 1;
#endif
	} else
		mm = vma->vm_mm;
	return apply_to_page_range(mm, address,
				   PAGE_SIZE, blktap_umap_uaddr_fn, vma);
}

static inline void
flush_tlb_kernel_page(unsigned long kvaddr)
{
#ifdef CONFIG_X86
	xen_invlpg_all(kvaddr);
#else
	flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
#endif
}

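/*
 * Tear down all grant mappings of a request: gather unmap operations
 * for the kernel and user mappings of each page, submit them in one
 * batched hypercall, and clear the ring's foreign_map slots.
 */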
/*
 * tap->tap_sem held on entry
 */
static void
blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
{
	uint64_t ptep;
	int ret, usr_idx;
	unsigned int i, cnt;
	struct page **map, *page;
	struct blktap_ring *ring;
	struct grant_handle_pair *khandle;
	unsigned long kvaddr, uvaddr, offset;
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
	grant_handle_t self_gref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int self_gref_nr = 0;

	cnt     = 0;
	ring    = &tap->ring;
	usr_idx = request->usr_idx;
	map     = ring->foreign_map.map;

	if (!ring->vma)
		return;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		zap_page_range(ring->vma,
			       MMAP_VADDR(ring->user_vstart, usr_idx, 0),
			       request->nr_pages << PAGE_SHIFT, NULL);

	for (i = 0; i < request->nr_pages; i++) {
		kvaddr = request_to_kaddr(request, i);
		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);

		khandle = request->handles + i;

		if (khandle->kernel != INVALID_GRANT_HANDLE) {
			gnttab_set_unmap_op(&unmap[cnt], kvaddr,
					    GNTMAP_host_map, khandle->kernel);
			cnt++;
			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
					    INVALID_P2M_ENTRY);
		}

		if (khandle->user != INVALID_GRANT_HANDLE) {
			BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
			if (create_lookup_pte_addr(ring->vma->vm_mm,
						   uvaddr, &ptep) != 0) {
				BTERR("Couldn't get a pte addr!\n");
				return;
			}

			gnttab_set_unmap_op(&unmap[cnt], ptep,
					    GNTMAP_host_map
					    | GNTMAP_application_map
					    | GNTMAP_contains_pte,
					    khandle->user);
			cnt++;
		}

		offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;

		BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
		      "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
		      "0x%08lx, handle: %u\n", offset, map[offset], request,
		      usr_idx, i, kvaddr, khandle->kernel, uvaddr,
		      khandle->user);

		page = map[offset];
		if (page) {
			ClearPageReserved(map[offset]);
			if (PageBlkback(page)) {
				ClearPageBlkback(page);
				set_page_private(page, 0);
			} else if (xen_feature(XENFEAT_auto_translated_physmap)) {
				self_gref[self_gref_nr] = khandle->kernel;
				self_gref_nr++;
			}
		}
		map[offset] = NULL;

		khandle->kernel = INVALID_GRANT_HANDLE;
		khandle->user   = INVALID_GRANT_HANDLE;
	}

	if (cnt) {
		ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
						unmap, cnt);
		BUG_ON(ret);
	}

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		zap_page_range(ring->vma,
			       MMAP_VADDR(ring->user_vstart, usr_idx, 0),
			       request->nr_pages << PAGE_SHIFT, NULL);
	else {
		for (i = 0; i < self_gref_nr; i++)
			gnttab_end_foreign_access_ref(self_gref[i]);
	}
}

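/*
 * Undo the per-page kernel PTEs installed by blktap_map(), then let
 * blktap_device_fast_flush() revoke the remaining grant mappings.
 */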
/*
 * tap->tap_sem held on entry
 */
static void
blktap_unmap(struct blktap *tap, struct blktap_request *request)
{
	int i, usr_idx;
	unsigned long kvaddr;

	usr_idx = request->usr_idx;
	down_write(&tap->ring.vma->vm_mm->mmap_sem);

	for (i = 0; i < request->nr_pages; i++) {
		kvaddr = request_to_kaddr(request, i);
		BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
		      "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
		      kvaddr, request->handles[i].kernel,
		      MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
		      request->handles[i].user);

		if (!xen_feature(XENFEAT_auto_translated_physmap) &&
		    request->handles[i].kernel == INVALID_GRANT_HANDLE) {
			if (blktap_umap_uaddr(NULL, kvaddr) == 0)
				flush_tlb_kernel_page(kvaddr);
			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
					    INVALID_P2M_ENTRY);
		}
	}

	blktap_device_fast_flush(tap, request);
	up_write(&tap->ring.vma->vm_mm->mmap_sem);
}

/*
 * Called if the tapdisk process dies unexpectedly: fail and release
 * any pending requests and disable the queue.
 */
void
blktap_device_fail_pending_requests(struct blktap *tap)
{
	int usr_idx;
	struct request *req;
	struct blktap_device *dev;
	struct blktap_request *request;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
		return;

	down_write(&tap->tap_sem);

	dev = &tap->device;
	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
		request = tap->pending_requests[usr_idx];
		if (!request || request->status != BLKTAP_REQUEST_PENDING)
			continue;

		BTERR("%u:%u: failing pending %s of %d pages\n",
		      blktap_device_major, tap->minor,
		      (request->operation == BLKIF_OP_PACKET ?
		       "packet" : request->operation == BLKIF_OP_READ ?
		       "read" : "write"), request->nr_pages);

		blktap_unmap(tap, request);
		req = (struct request *)(unsigned long)request->id;
		blk_end_request_all(req, -ENODEV);
		blktap_request_free(tap, request);
	}

	up_write(&tap->tap_sem);

	spin_lock_irq(&dev->lock);

	/* fail any future requests */
	dev->gd->queue->queuedata = NULL;
	blk_start_queue(dev->gd->queue);

	spin_unlock_irq(&dev->lock);
}

/*
 * tap->tap_sem held on entry
 */
void
blktap_device_finish_request(struct blktap *tap,
			     blkif_response_t *res,
			     struct blktap_request *request)
{
	struct request *req;

	blktap_unmap(tap, request);

	req = (struct request *)(unsigned long)request->id;

	BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
	      res->status, res->operation, request->operation,
	      (unsigned long long)res->id);

	switch (request->operation) {
	case BLKIF_OP_READ:
	case BLKIF_OP_WRITE:
	case BLKIF_OP_PACKET:
		if (unlikely(res->status != BLKIF_RSP_OKAY))
			BTERR("Bad return from device data "
			      "request: %x\n", res->status);
		blk_end_request_all(req,
				    res->status == BLKIF_RSP_OKAY ? 0 : -EIO);
		break;
	default:
		BUG();
	}

	blktap_request_free(tap, request);
}

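/*
 * Prepare a segment whose page already belongs to a foreign domain
 * (it carries blkback pagemap data): queue map operations that remap
 * the original grant into our kernel and, unless auto-translated,
 * user address space.
 */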
static int
blktap_prep_foreign(struct blktap *tap,
		    struct blktap_request *request,
		    blkif_request_t *blkif_req,
		    unsigned int seg, struct page *page,
		    struct blktap_grant_table *table)
{
	uint64_t ptep;
	uint32_t flags;
	struct page *tap_page;
	struct blktap_ring *ring;
	struct blkback_pagemap map;
	unsigned long uvaddr, kvaddr;

	ring = &tap->ring;
	map  = blkback_pagemap_read(page);
	blkif_req->seg[seg].gref = map.gref;

	uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
	kvaddr = request_to_kaddr(request, seg);
	flags  = GNTMAP_host_map |
		(request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);

	gnttab_set_map_op(&table->grants[table->cnt],
			  kvaddr, flags, map.gref, map.domid);
	table->cnt++;

	/* enable chained tap devices */
	tap_page = request_to_page(request, seg);
	set_page_private(tap_page, page_private(page));
	SetPageBlkback(tap_page);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
		BTERR("couldn't get a pte addr!\n");
		return -1;
	}

	flags |= GNTMAP_application_map | GNTMAP_contains_pte;
	gnttab_set_map_op(&table->grants[table->cnt],
			  ptep, flags, map.gref, map.domid);
	table->cnt++;

	return 0;
}

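/*
 * Submit the batched map hypercall prepared by blktap_prep_foreign()
 * and record the returned handles; successfully mapped frames are
 * wired into the p2m table, or into the ring vma when auto-translated.
 */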
static int
blktap_map_foreign(struct blktap *tap,
		   struct blktap_request *request,
		   blkif_request_t *blkif_req,
		   struct blktap_grant_table *table)
{
	struct page *page;
	int i, grant, err, usr_idx;
	struct blktap_ring *ring;
	unsigned long uvaddr, foreign_mfn;

	if (!table->cnt)
		return 0;

	err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
					table->grants, table->cnt);
	BUG_ON(err);

	grant   = 0;
	usr_idx = request->usr_idx;
	ring    = &tap->ring;

	for (i = 0; i < request->nr_pages; i++) {
		if (!blkif_req->seg[i].gref)
			continue;

		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);

		if (unlikely(table->grants[grant].status)) {
			BTERR("invalid kernel buffer: could not remap it\n");
			/* This should never happen: blkback should handle
			   eagain first */
			BUG_ON(table->grants[grant].status == GNTST_eagain);
			err |= 1;
			table->grants[grant].handle = INVALID_GRANT_HANDLE;
		}

		request->handles[i].kernel = table->grants[grant].handle;
		foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
		grant++;

		if (xen_feature(XENFEAT_auto_translated_physmap))
			goto done;

		if (unlikely(table->grants[grant].status)) {
			BTERR("invalid user buffer: could not remap it\n");
			err |= 1;
			table->grants[grant].handle = INVALID_GRANT_HANDLE;
		}

		request->handles[i].user = table->grants[grant].handle;
		grant++;

	done:
		if (err)
			continue;

		page = request_to_page(request, i);

		if (!xen_feature(XENFEAT_auto_translated_physmap))
			set_phys_to_machine(page_to_pfn(page),
					    FOREIGN_FRAME(foreign_mfn));
		else if (vm_insert_page(ring->vma, uvaddr, page))
			err |= 1;

		BTDBG("pending_req: %p, seg: %d, page: %p, "
		      "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
		      "uhandle: %u\n", request, i, page,
		      pfn_to_kaddr(page_to_pfn(page)),
		      request->handles[i].kernel,
		      uvaddr, request->handles[i].user);
	}

	return err;
}

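/*
 * Map an ordinary local page into the request's kernel and user
 * slots: by writing PTEs directly in the non-auto-translated case,
 * or by granting the page to our own domain and mapping it through
 * the grant table otherwise.
 */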
static int
blktap_map(struct blktap *tap,
	   struct blktap_request *request,
	   unsigned int seg, struct page *page)
{
	pte_t pte;
	int usr_idx;
	struct blktap_ring *ring;
	unsigned long uvaddr, kvaddr;
	int err = 0;

	ring    = &tap->ring;
	usr_idx = request->usr_idx;
	uvaddr  = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
	kvaddr  = request_to_kaddr(request, seg);

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		pte = mk_pte(page, ring->vma->vm_page_prot);
		blktap_map_uaddr(ring->vma, uvaddr,
				 pte_mkspecial(pte_mkwrite(pte)));
		flush_tlb_page(ring->vma, uvaddr);
		blktap_map_uaddr(NULL, kvaddr, mk_pte(page, PAGE_KERNEL));
		flush_tlb_kernel_page(kvaddr);

		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
		request->handles[seg].kernel = INVALID_GRANT_HANDLE;
	} else {
		/* grant this page access to self domain and map it. */
		domid_t domid = 0; /* XXX my domain id: grant table hypercall
				      doesn't understand DOMID_SELF */
		int gref;
		uint32_t flags;
		struct gnttab_map_grant_ref map;
		struct page *tap_page;

		gref = gnttab_grant_foreign_access(
			domid, page_to_pfn(page),
			(request->operation == BLKIF_OP_WRITE) ?
			GTF_readonly : 0);

		flags  = GNTMAP_host_map |
			(request->operation == BLKIF_OP_WRITE ?
			 GNTMAP_readonly : 0);

		gnttab_set_map_op(&map, kvaddr, flags, gref, domid);

		/* enable chained tap devices */
		tap_page = request_to_page(request, seg);
		set_page_private(tap_page, page_private(page));
		SetPageBlkback(tap_page);

		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
						&map, 1);
		BUG_ON(err);
		/* We are not expecting the grant op to fail */
		BUG_ON(map.status != GNTST_okay);

		err = vm_insert_page(ring->vma, uvaddr, tap_page);
		if (err) {
			struct gnttab_unmap_grant_ref unmap;

			gnttab_set_unmap_op(&unmap, kvaddr,
					    GNTMAP_host_map, gref);
			VOID(HYPERVISOR_grant_table_op(
				     GNTTABOP_unmap_grant_ref, &unmap, 1));
		} else
			request->handles[seg].kernel = gref;
	}
	request->handles[seg].user = INVALID_GRANT_HANDLE;

	BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
	      "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
	      uvaddr);

	return err;
}

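/*
 * Translate one block layer request into a blkif ring request: map
 * each scatterlist segment (foreign pages via the grant table, local
 * pages via blktap_map()), then post the message to the userspace
 * ring.
 */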
static int
blktap_device_process_request(struct blktap *tap,
			      struct blktap_request *request,
			      struct request *req)
{
	struct page *page;
	int i, usr_idx, err;
	struct blktap_ring *ring;
	struct scatterlist *sg;
	struct blktap_grant_table table;
	unsigned int fsect, lsect, nr_sects;
	unsigned long offset, uvaddr;
	struct blkif_request blkif_req, *target;

	err = -1;
	memset(&table, 0, sizeof(table));

	if (!blktap_active(tap))
		goto out;

	ring    = &tap->ring;
	usr_idx = request->usr_idx;
	blkif_req.id = usr_idx;
	blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req);
	blkif_req.handle = 0;
	blkif_req.operation = rq_data_dir(req) ?
		BLKIF_OP_WRITE : BLKIF_OP_READ;
	if (unlikely(blk_pc_request(req)))
		blkif_req.operation = BLKIF_OP_PACKET;

	request->id        = (unsigned long)req;
	request->operation = blkif_req.operation;
	request->status    = BLKTAP_REQUEST_PENDING;
	do_gettimeofday(&request->time);

	nr_sects = 0;
	request->nr_pages = 0;
	blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
	BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
	for_each_sg(tap->sg, sg, blkif_req.nr_segments, i) {
		fsect = sg->offset >> 9;
		lsect = fsect + (sg->length >> 9) - 1;
		nr_sects += sg->length >> 9;

		blkif_req.seg[i] = (struct blkif_request_segment) {
			.gref       = 0,
			.first_sect = fsect,
			.last_sect  = lsect };

		if (PageBlkback(sg_page(sg))) {
			/* foreign page -- use xen */
			if (blktap_prep_foreign(tap, request, &blkif_req,
						i, sg_page(sg), &table))
				goto out;
		} else {
			/* do it the old fashioned way */
			if (blktap_map(tap, request, i, sg_page(sg)))
				goto out;
		}

		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
		offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
		page   = request_to_page(request, i);
		ring->foreign_map.map[offset] = page;
		SetPageReserved(page);

		BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
		      uvaddr, page, page_to_pfn(page));
		BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
		      "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
		      offset, request, i,
		      page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);

		request->nr_pages++;
	}

	if (blktap_map_foreign(tap, request, &blkif_req, &table))
		goto out;

	/* Finally, write the request message to the user ring. */
	target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
	memcpy(target, &blkif_req, sizeof(blkif_req));
	target->id = request->usr_idx;
	wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
	ring->ring.req_prod_pvt++;

	if (unlikely(blk_pc_request(req)))
		tap->stats.st_pk_req++;
	else if (rq_data_dir(req)) {
		tap->stats.st_wr_sect += nr_sects;
		tap->stats.st_wr_req++;
	} else {
		tap->stats.st_rd_sect += nr_sects;
		tap->stats.st_rd_req++;
	}

	err = 0;

out:
	if (err)
		blktap_device_fast_flush(tap, request);
	return err;
}

#ifdef ENABLE_PASSTHROUGH
#define rq_for_each_bio_safe(_bio, _tmp, _req)				\
	if ((_req)->bio)						\
		for (_bio = (_req)->bio;				\
		     _bio && ((_tmp = _bio->bi_next) || 1);		\
		     _bio = _tmp)

static void
blktap_device_forward_request(struct blktap *tap, struct request *req)
{
	struct bio *bio, *tmp;
	struct blktap_device *dev;

	dev = &tap->device;

	rq_for_each_bio_safe(bio, tmp, req) {
		bio->bi_bdev = dev->bdev;
		submit_bio(bio->bi_rw, bio);
	}
}

static void
blktap_device_close_bdev(struct blktap *tap)
{
	struct blktap_device *dev;

	dev = &tap->device;

	if (dev->bdev)
		blkdev_put(dev->bdev);

	dev->bdev = NULL;
	clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
}

static int
blktap_device_open_bdev(struct blktap *tap, u32 pdev)
{
	struct block_device *bdev;
	struct blktap_device *dev;

	dev = &tap->device;

	bdev = open_by_devnum(pdev, FMODE_WRITE);
	if (IS_ERR(bdev)) {
		BTERR("opening device %x:%x failed: %ld\n",
		      MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
		return PTR_ERR(bdev);
	}

	if (!bdev->bd_disk) {
		BTERR("device %x:%x doesn't exist\n",
		      MAJOR(pdev), MINOR(pdev));
		/* put the bdev we just opened; dev->bdev is not set yet */
		blkdev_put(bdev);
		return -ENOENT;
	}

	dev->bdev = bdev;
	set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);

	/* TODO: readjust queue parameters */

	BTINFO("set device %d to passthrough on %x:%x\n",
	       tap->minor, MAJOR(pdev), MINOR(pdev));

	return 0;
}

int
blktap_device_enable_passthrough(struct blktap *tap,
				 unsigned major, unsigned minor)
{
	u32 pdev;
	struct blktap_device *dev;

	dev  = &tap->device;
	pdev = MKDEV(major, minor);

	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return -EINVAL;

	if (dev->bdev) {
		if (pdev)
			return -EINVAL;
		blktap_device_close_bdev(tap);
		return 0;
	}

	return blktap_device_open_bdev(tap, pdev);
}
#endif

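/*
 * Drain the block layer queue into the userspace ring, stopping the
 * queue whenever the ring fills up or request structs run out.
 */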
/*
 * dev->lock held on entry
 */
static void
blktap_device_run_queue(struct blktap *tap)
{
	int queued, err;
	struct request_queue *rq;
	struct request *req;
	struct blktap_ring *ring;
	struct blktap_device *dev;
	struct blktap_request *request;

	queued = 0;
	ring   = &tap->ring;
	dev    = &tap->device;
	rq     = dev->gd->queue;

	BTDBG("running queue for %d\n", tap->minor);

	while ((req = blk_peek_request(rq)) != NULL) {
		if (RING_FULL(&ring->ring)) {
		wait:
			/* Avoid pointless unplugs. */
			blk_stop_queue(rq);
			blktap_defer(tap);
			break;
		}

		blk_start_request(req);

		if (!blk_fs_request(req)) {
			__blk_end_request_all(req, -EIO);
			continue;
		}

		if (blk_barrier_rq(req)) {
			__blk_end_request_all(req, -EOPNOTSUPP);
			continue;
		}

#ifdef ENABLE_PASSTHROUGH
		if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
			blktap_device_forward_request(tap, req);
			continue;
		}
#endif

		request = blktap_request_allocate(tap);
		if (!request) {
			tap->stats.st_oo_req++;
			goto wait;
		}

		BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
		      "buffer:%p [%s], pending: %p\n", req, tap->minor,
		      req->cmd, (unsigned long long)blk_rq_pos(req),
		      blk_rq_cur_sectors(req), blk_rq_sectors(req), req->buffer,
		      rq_data_dir(req) ? "write" : "read", request);

		spin_unlock_irq(&dev->lock);
		down_read(&tap->tap_sem);

		err = blktap_device_process_request(tap, request, req);
		if (!err)
			queued++;
		else {
			blk_end_request_all(req, err);
			blktap_request_free(tap, request);
		}

		up_read(&tap->tap_sem);
		spin_lock_irq(&dev->lock);
	}

	if (queued)
		blktap_ring_kick_user(tap);
}

/*
 * dev->lock held on entry
 */
static void
blktap_device_do_request(struct request_queue *rq)
{
	struct request *req;
	struct blktap *tap;
	struct blktap_device *dev;

	dev = rq->queuedata;
	if (!dev)
		goto fail;

	tap = dev_to_blktap(dev);
	if (!blktap_active(tap))
		goto fail;

	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
		blktap_defer(tap);
		return;
	}

	blktap_device_run_queue(tap);
	return;

fail:
	while ((req = blk_peek_request(rq))) {
		BTERR("device closed: failing secs %llu - %llu\n",
		      (unsigned long long)blk_rq_pos(req),
		      (unsigned long long)blk_rq_pos(req)
		      + blk_rq_cur_sectors(req));
		blk_start_request(req);
		__blk_end_request_all(req, -EIO);
	}
}

void
blktap_device_restart(struct blktap *tap)
{
	struct blktap_device *dev;

	dev = &tap->device;

	if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
		blktap_defer(tap);
		return;
	}

	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
		blktap_defer(tap);
		return;
	}

	spin_lock_irq(&dev->lock);

	/* Re-enable calldowns. */
	if (dev->gd) {
		struct request_queue *rq = dev->gd->queue;

		if (blk_queue_stopped(rq))
			blk_start_queue(rq);

		/* Kick things off immediately. */
		blktap_device_do_request(rq);
	}

	spin_unlock_irq(&dev->lock);
}

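/*
 * Apply the tapdisk-supplied parameters to the block device: set the
 * capacity and constrain the queue so a merged request still fits in
 * a single ring slot.
 */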
static void
blktap_device_configure(struct blktap *tap)
{
	struct request_queue *rq;
	struct blktap_device *dev = &tap->device;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
		return;

	rq = dev->gd->queue;

	spin_lock_irq(&dev->lock);

	set_capacity(dev->gd, tap->params.capacity);

	/* Hard sector size and max sectors impersonate the equiv. hardware. */
	blk_queue_logical_block_size(rq, tap->params.sector_size);
	blk_queue_max_hw_sectors(rq, 512);

	/* Each segment in a request is up to an aligned page in size. */
	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
	blk_queue_max_segment_size(rq, PAGE_SIZE);

	/* Ensure a merged request will fit in a single I/O ring slot. */
	blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);

	/* Make sure buffer addresses are sector-aligned. */
	blk_queue_dma_alignment(rq, 511);

	spin_unlock_irq(&dev->lock);
}

int
blktap_device_resume(struct blktap *tap)
{
	int err;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
		return -ENODEV;

	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return 0;

	err = blktap_ring_resume(tap);
	if (err)
		return err;

	/* device size may have changed */
	blktap_device_configure(tap);

	BTDBG("restarting device\n");
	blktap_device_restart(tap);

	return 0;
}

int
blktap_device_pause(struct blktap *tap)
{
	unsigned long flags;
	struct blktap_device *dev = &tap->device;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
		return -ENODEV;

	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return 0;

	spin_lock_irqsave(&dev->lock, flags);

	blk_stop_queue(dev->gd->queue);
	set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);

	spin_unlock_irqrestore(&dev->lock, flags);

	return blktap_ring_pause(tap);
}

int
blktap_device_destroy(struct blktap *tap)
{
	struct blktap_device *dev = &tap->device;
	struct gendisk *gd = dev->gd;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
		return 0;

	BTINFO("destroy device %d users %d\n", tap->minor, dev->users);

	if (dev->users)
		return -EBUSY;

	spin_lock_irq(&dev->lock);
	/* No more blktap_device_do_request(). */
	blk_stop_queue(gd->queue);
	clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
	dev->gd = NULL;
	spin_unlock_irq(&dev->lock);

#ifdef ENABLE_PASSTHROUGH
	if (dev->bdev)
		blktap_device_close_bdev(tap);
#endif

	del_gendisk(gd);
	blk_cleanup_queue(gd->queue);
	put_disk(gd);

	wake_up(&tap->wq);

	return 0;
}

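/*
 * Allocate and register the tapdev gendisk and its request queue.
 * Minors 0-25 become tapdeva..tapdevz; larger minors get a two-letter
 * suffix in the style of sd device names (tapdevaa, tapdevab, ...).
 */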
int
blktap_device_create(struct blktap *tap)
{
	int minor, err;
	struct gendisk *gd;
	struct request_queue *rq;
	struct blktap_device *dev;

	gd    = NULL;
	rq    = NULL;
	dev   = &tap->device;
	minor = tap->minor;

	if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
		return -EEXIST;

	if (blktap_validate_params(tap, &tap->params))
		return -EINVAL;

	BTINFO("minor %d sectors %Lu sector-size %lu\n",
	       minor, tap->params.capacity, tap->params.sector_size);

	err = -ENODEV;

	gd = alloc_disk(1);
	if (!gd)
		goto error;

	if (minor < 26)
		sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
	else
		sprintf(gd->disk_name, "tapdev%c%c",
			'a' + ((minor / 26) - 1), 'a' + (minor % 26));

	gd->major = blktap_device_major;
	gd->first_minor = minor;
	gd->fops = &blktap_device_file_operations;
	gd->private_data = dev;

	spin_lock_init(&dev->lock);
	rq = blk_init_queue(blktap_device_do_request, &dev->lock);
	if (!rq)
		goto error;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	elevator_init(rq, "noop");
#else
	elevator_init(rq, &elevator_noop);
#endif

	gd->queue     = rq;
	rq->queuedata = dev;
	dev->gd       = gd;

	set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
	blktap_device_configure(tap);

	add_disk(gd);

	err = 0;
	goto out;

 error:
	if (gd)
		put_disk(gd); /* disk was never added, so put, don't del */
	if (rq)
		blk_cleanup_queue(rq);

 out:
	BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
	return err;
}

int __init
blktap_device_init(int *maj)
{
	int major;

	/* Dynamically allocate a major for this device */
	major = register_blkdev(0, "tapdev");
	if (major < 0) {
		BTERR("Couldn't register blktap device\n");
		return -ENOMEM;
	}

	blktap_device_major = *maj = major;
	BTINFO("blktap device major %d\n", major);

	return 0;
}

void
blktap_device_free(void)
{
	if (blktap_device_major)
		unregister_blkdev(blktap_device_major, "tapdev");
}