/******************************************************************************
 * blkfront.c
 *
 * XenLinux virtual block-device driver.
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 * Copyright (c) 2005, XenSource Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/version.h>
#include "block.h"
#include <linux/cdrom.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/scatterlist.h>
#include <scsi/scsi.h>
#include <xen/evtchn.h>
#include <xen/xenbus.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/protocols.h>
#include <xen/gnttab.h>
#include <asm/hypervisor.h>
#include <asm/maddr.h>

#ifdef HAVE_XEN_PLATFORM_COMPAT_H
#include <xen/platform-compat.h>
#endif

#define BLKIF_STATE_DISCONNECTED 0
#define BLKIF_STATE_CONNECTED    1
#define BLKIF_STATE_SUSPENDED    2

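/*
 * Upper bound on in-flight work: every ring slot can carry up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST data segments, so this is the most
 * segments that can be outstanding at once.
 */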
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)

static void connect(struct blkfront_info *);
static void blkfront_closing(struct blkfront_info *);
static int blkfront_remove(struct xenbus_device *);
static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
static int setup_blkring(struct xenbus_device *, struct blkfront_info *);

static void kick_pending_request_queues(struct blkfront_info *);

static irqreturn_t blkif_int(int irq, void *dev_id);
static void blkif_restart_queue(struct work_struct *arg);
static int blkif_recover(struct blkfront_info *);
static void blkif_completion(struct blk_shadow *);
static void blkif_free(struct blkfront_info *, int);


/**
 * Entry point to this code when a new device is created.  Allocate the basic
 * structures and the ring buffer for communication with the backend, and
 * inform the backend of the appropriate details for those.  Switch to the
 * Initialised state.
 */
static int blkfront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
{
        int err, vdevice, i;
        struct blkfront_info *info;

#ifndef CONFIG_XEN /* For HVM guests, do not take over CDROM devices. */
        char *type;

        type = xenbus_read(XBT_NIL, dev->nodename, "device-type", NULL);
        if (IS_ERR(type)) {
                xenbus_dev_fatal(dev, PTR_ERR(type), "reading dev type");
                return PTR_ERR(type);
        }
        if (!strncmp(type, "cdrom", 5)) {
                /*
                 * We have been handed a CDROM device in an HVM guest; let
                 * the native CDROM driver handle it.
                 */
                kfree(type);
                pr_notice("blkfront: ignoring CDROM %s\n", dev->nodename);
                return -ENXIO;
        }
        kfree(type);
#endif

        /* FIXME: Use dynamic device id if this is not set. */
        err = xenbus_scanf(XBT_NIL, dev->nodename,
                           "virtual-device", "%i", &vdevice);
        if (err != 1) {
                /* go looking in the extended area instead */
                err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
                        "%i", &vdevice);
                if (err != 1) {
                        xenbus_dev_fatal(dev, err, "reading virtual-device");
                        return err;
                }
        }

        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
                return -ENOMEM;
        }

        spin_lock_init(&info->io_lock);
        mutex_init(&info->mutex);
        info->xbdev = dev;
        info->vdevice = vdevice;
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);

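        /*
         * The shadow free list is threaded through the otherwise-unused
         * req.id fields: free entry i points at entry i+1, and the last
         * entry carries an out-of-range sentinel value.
         */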
        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/') + 1, NULL, 0);
        dev_set_drvdata(&dev->dev, info);

        err = talk_to_backend(dev, info);
        if (err) {
                kfree(info);
                dev_set_drvdata(&dev->dev, NULL);
                return err;
        }

        return 0;
}


/**
 * We are reconnecting to the backend, due to a suspend/resume, or a backend
 * driver restart.  We tear down our blkif structure and recreate it, but
 * leave the device-layer structures intact so that this is transparent to the
 * rest of the kernel.
 */
static int blkfront_resume(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
        int err;

        DPRINTK("blkfront_resume: %s\n", dev->nodename);

        blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);

        err = talk_to_backend(dev, info);
        if (info->connected == BLKIF_STATE_SUSPENDED && !err)
                err = blkif_recover(info);

        return err;
}


/* Common code used when first setting up, and when resuming. */
static int talk_to_backend(struct xenbus_device *dev,
                           struct blkfront_info *info)
{
        const char *message = NULL;
        struct xenbus_transaction xbt;
        int err;

        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
        if (err)
                goto out;

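        /*
         * Xenbus transactions fail with -EAGAIN when another transaction
         * has touched the same keys in the meantime; in that case we simply
         * restart the whole transaction from here.
         */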
again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                xenbus_dev_fatal(dev, err, "starting transaction");
                goto destroy_blkring;
        }

        err = xenbus_printf(xbt, dev->nodename,
                            "ring-ref", "%u", info->ring_ref);
        if (err) {
                message = "writing ring-ref";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
                            irq_to_evtchn_port(info->irq));
        if (err) {
                message = "writing event-channel";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
                            XEN_IO_PROTO_ABI_NATIVE);
        if (err) {
                message = "writing protocol";
                goto abort_transaction;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err) {
                if (err == -EAGAIN)
                        goto again;
                xenbus_dev_fatal(dev, err, "completing transaction");
                goto destroy_blkring;
        }

        xenbus_switch_state(dev, XenbusStateInitialised);

        return 0;

 abort_transaction:
        xenbus_transaction_end(xbt, 1);
        if (message)
                xenbus_dev_fatal(dev, err, "%s", message);
 destroy_blkring:
        blkif_free(info, 0);
 out:
        return err;
}


static int setup_blkring(struct xenbus_device *dev,
                         struct blkfront_info *info)
{
        blkif_sring_t *sring;
        int err;

        info->ring_ref = GRANT_INVALID_REF;

        sring = (blkif_sring_t *)__get_free_page(GFP_NOIO | __GFP_HIGH);
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);

        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);

        err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
        if (err < 0) {
                free_page((unsigned long)sring);
                info->ring.sring = NULL;
                goto fail;
        }
        info->ring_ref = err;

        err = bind_listening_port_to_irqhandler(
                dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
        if (err <= 0) {
                xenbus_dev_fatal(dev, err,
                                 "bind_listening_port_to_irqhandler");
                goto fail;
        }
        info->irq = err;

        return 0;
fail:
        blkif_free(info, 0);
        return err;
}


/**
 * Callback received when the backend's state changes.
 */
static void backend_changed(struct xenbus_device *dev,
                            enum xenbus_state backend_state)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
        struct block_device *bd;

        DPRINTK("blkfront:backend_changed.\n");

        switch (backend_state) {
        case XenbusStateInitialising:
        case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateReconfiguring:
        case XenbusStateReconfigured:
        case XenbusStateUnknown:
        case XenbusStateClosed:
                break;

        case XenbusStateConnected:
                connect(info);
                break;

        case XenbusStateClosing:
                mutex_lock(&info->mutex);
                if (dev->state == XenbusStateClosing) {
                        mutex_unlock(&info->mutex);
                        break;
                }

                bd = info->gd ? bdget_disk(info->gd, 0) : NULL;

                mutex_unlock(&info->mutex);

                if (bd == NULL) {
                        xenbus_frontend_closed(dev);
                        break;
                }

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
                down(&bd->bd_sem);
#else
                mutex_lock(&bd->bd_mutex);
#endif
                if (bd->bd_openers) {
                        xenbus_dev_error(dev, -EBUSY,
                                         "Device in use; refusing to close");
                        xenbus_switch_state(dev, XenbusStateClosing);
                } else
                        blkfront_closing(info);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
                up(&bd->bd_sem);
#else
                mutex_unlock(&bd->bd_mutex);
#endif
                bdput(bd);
                break;
        }
}


/* ** Connection ** */

static void blkfront_setup_discard(struct blkfront_info *info)
{
        int err;
        char *type;
        unsigned int discard_granularity;
        unsigned int discard_alignment;
        int discard_secure;

        type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
        if (IS_ERR(type))
                return;

        info->feature_secdiscard = 0;
        if (strncmp(type, "phy", 3) == 0) {
                err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                        "discard-granularity", "%u", &discard_granularity,
                        "discard-alignment", "%u", &discard_alignment,
                        NULL);
                if (!err) {
                        info->feature_discard = 1;
                        info->discard_granularity = discard_granularity;
                        info->discard_alignment = discard_alignment;
                }
                err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
                            "discard-secure", "%d", &discard_secure);
                if (err == 1)
                        info->feature_secdiscard = discard_secure;
        } else if (strncmp(type, "file", 4) == 0)
                info->feature_discard = 1;

        kfree(type);
}

/*
 * Invoked when the backend is finally 'ready' (and has produced the
 * details about the physical device: #sectors, sector size, etc).
 */
static void connect(struct blkfront_info *info)
{
        unsigned long long sectors;
        unsigned long sector_size;
        unsigned int binfo;
        int err, barrier, flush, discard;

        switch (info->connected) {
        case BLKIF_STATE_CONNECTED:
                /*
                 * Potentially, the back-end may be signalling
                 * a capacity change; update the capacity.
                 */
                err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
                                   "sectors", "%Lu", &sectors);
                if (err != 1)
                        return;
                err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
                                   "sector-size", "%lu", &sector_size);
                if (err != 1)
                        sector_size = 0;
                if (sector_size)
                        blk_queue_logical_block_size(info->gd->queue,
                                                     sector_size);
                pr_info("Setting capacity to %Lu\n", sectors);
                set_capacity(info->gd, sectors);
                revalidate_disk(info->gd);

                /* fall through */
        case BLKIF_STATE_SUSPENDED:
                return;
        }

        DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);

        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                            "sectors", "%Lu", &sectors,
                            "info", "%u", &binfo,
                            "sector-size", "%lu", &sector_size,
                            NULL);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err,
                                 "reading backend fields at %s",
                                 info->xbdev->otherend);
                return;
        }

        info->feature_flush = 0;
        info->flush_op = 0;

        err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
                           "feature-barrier", "%d", &barrier);
        /*
         * If there's no "feature-barrier" defined, then it means
         * we're dealing with a very old backend which writes
         * synchronously; nothing to do.
         *
         * If there are barriers, then we use flush.
         */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
        if (err > 0 && barrier) {
                info->feature_flush = REQ_FLUSH | REQ_FUA;
                info->flush_op = BLKIF_OP_WRITE_BARRIER;
        }
        /*
         * And if there is "feature-flush-cache", prefer that over
         * barriers.
         */
        err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
                           "feature-flush-cache", "%d", &flush);
        if (err > 0 && flush) {
                info->feature_flush = REQ_FLUSH;
                info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
        }
#else
        if (err <= 0)
                info->feature_flush = QUEUE_ORDERED_DRAIN;
        else if (barrier)
                info->feature_flush = QUEUE_ORDERED_TAG;
        else
                info->feature_flush = QUEUE_ORDERED_NONE;
#endif

        err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
                           "feature-discard", "%d", &discard);

        if (err > 0 && discard)
                blkfront_setup_discard(info);

        err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
                                 info->xbdev->otherend);
                return;
        }

        err = xlvbd_sysfs_addif(info);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err, "xlvbd_sysfs_addif at %s",
                                 info->xbdev->otherend);
                return;
        }

        (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);

        /* Kick pending requests. */
        spin_lock_irq(&info->io_lock);
        info->connected = BLKIF_STATE_CONNECTED;
        kick_pending_request_queues(info);
        spin_unlock_irq(&info->io_lock);

        add_disk(info->gd);

        info->is_ready = 1;

        register_vcd(info);
}

/**
 * Handle the change of state of the backend to Closing.  We must delete our
 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
 * acknowledgement.
 */
static void blkfront_closing(struct blkfront_info *info)
{
        unsigned long flags;

        DPRINTK("blkfront_closing: %d removed\n", info->vdevice);

        if (info->rq == NULL)
                goto out;

        spin_lock_irqsave(&info->io_lock, flags);
        /* No more blkif_request(). */
        blk_stop_queue(info->rq);
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irqrestore(&info->io_lock, flags);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_work_sync(&info->work);

        xlvbd_sysfs_delif(info);

        unregister_vcd(info);

        xlvbd_del(info);

 out:
        if (info->xbdev)
                xenbus_frontend_closed(info->xbdev);
}


static int blkfront_remove(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
        struct block_device *bd;
        struct gendisk *disk;

        DPRINTK("blkfront_remove: %s removed\n", dev->nodename);

        blkif_free(info, 0);

        mutex_lock(&info->mutex);

        disk = info->gd;
        bd = disk ? bdget_disk(disk, 0) : NULL;

        info->xbdev = NULL;
        mutex_unlock(&info->mutex);

        if (!bd) {
                kfree(info);
                return 0;
        }

        /*
         * The xbdev was removed before we reached the Closed
         * state. See if it's safe to remove the disk. If the bdev
         * isn't closed yet, we let release take care of it.
         */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
        down(&bd->bd_sem);
#else
        mutex_lock(&bd->bd_mutex);
#endif
        info = disk->private_data;

        dev_warn(disk_to_dev(disk),
                 "%s was hot-unplugged, %d stale handles\n",
                 dev->nodename, bd->bd_openers);

        if (info && !bd->bd_openers) {
                blkfront_closing(info);
                disk->private_data = NULL;
                kfree(info);
        }

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
        up(&bd->bd_sem);
#else
        mutex_unlock(&bd->bd_mutex);
#endif
        bdput(bd);

        return 0;
}


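/*
 * Request ids double as a free list: a free shadow entry's req.id holds
 * the index of the next free entry, so allocation and release are O(1)
 * and need no extra storage.
 */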
static inline int GET_ID_FROM_FREELIST(
        struct blkfront_info *info)
{
        unsigned long free = info->shadow_free;
        BUG_ON(free >= BLK_RING_SIZE);
        info->shadow_free = info->shadow[free].req.id;
        info->shadow[free].req.id = 0x0fffffee; /* debug */
        return free;
}

static inline void ADD_ID_TO_FREELIST(
        struct blkfront_info *info, unsigned long id)
{
        info->shadow[id].req.id  = info->shadow_free;
        info->shadow[id].request = NULL;
        info->shadow_free = id;
}

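/*
 * Publish the requests produced so far to the shared ring and, if the
 * ring macro reports that the backend is waiting for an event, notify it
 * over the event channel.
 */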
static inline void flush_requests(struct blkfront_info *info)
{
        int notify;

        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);

        if (notify)
                notify_remote_via_irq(info->irq);
}

static void kick_pending_request_queues(struct blkfront_info *info)
{
        if (!RING_FULL(&info->ring)) {
                /* Re-enable calldowns. */
                blk_start_queue(info->rq);
                /* Kick things off immediately. */
                do_blkif_request(info->rq);
        }
}

static void blkif_restart_queue(struct work_struct *arg)
{
        struct blkfront_info *info = container_of(arg, struct blkfront_info, work);
        spin_lock_irq(&info->io_lock);
        if (info->connected == BLKIF_STATE_CONNECTED)
                kick_pending_request_queues(info);
        spin_unlock_irq(&info->io_lock);
}

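/*
 * Called by the grant-table code when the references we asked for become
 * available; it may run in atomic context, so just punt the actual queue
 * restart to the workqueue.
 */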
static void blkif_restart_queue_callback(void *arg)
{
        struct blkfront_info *info = (struct blkfront_info *)arg;
        schedule_work(&info->work);
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
int blkif_open(struct inode *inode, struct file *filep)
{
        struct block_device *bd = inode->i_bdev;
#else
int blkif_open(struct block_device *bd, fmode_t mode)
{
#endif
        struct blkfront_info *info = bd->bd_disk->private_data;
        int err = 0;

        if (!info)
                /* xbdev gone */
                err = -ERESTARTSYS;
        else {
                mutex_lock(&info->mutex);

                if (!info->gd)
                        /* xbdev is closed */
                        err = -ERESTARTSYS;

                mutex_unlock(&info->mutex);
        }

        return err;
}


#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
int blkif_release(struct inode *inode, struct file *filep)
{
        struct gendisk *disk = inode->i_bdev->bd_disk;
#else
int blkif_release(struct gendisk *disk, fmode_t mode)
{
#endif
        struct blkfront_info *info = disk->private_data;
        struct xenbus_device *xbdev;
        struct block_device *bd = bdget_disk(disk, 0);
        int openers = bd->bd_openers;

        /* Read bd_openers before dropping our temporary reference. */
        bdput(bd);
        if (openers)
                return 0;

        /*
         * Check if we have been instructed to close. We will have
         * deferred this request, because the bdev was still open.
         */
        mutex_lock(&info->mutex);
        xbdev = info->xbdev;

        if (xbdev && xbdev->state == XenbusStateClosing) {
                /* pending switch to state closed */
                dev_info(disk_to_dev(disk), "releasing disk\n");
                blkfront_closing(info);
        }

        mutex_unlock(&info->mutex);

        if (!xbdev) {
                /* sudden device removal */
                dev_info(disk_to_dev(disk), "releasing disk\n");
                blkfront_closing(info);
                disk->private_data = NULL;
                kfree(info);
        }

        return 0;
}


#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
int blkif_ioctl(struct inode *inode, struct file *filep,
                unsigned command, unsigned long argument)
{
        struct block_device *bd = inode->i_bdev;
#else
int blkif_ioctl(struct block_device *bd, fmode_t mode,
                unsigned command, unsigned long argument)
{
#endif
        struct blkfront_info *info = bd->bd_disk->private_data;
        int i;

        DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
                      command, (long)argument, bd->bd_dev);

        switch (command) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
        case HDIO_GETGEO: {
                struct hd_geometry geo;
                int ret;

                if (!argument)
                        return -EINVAL;

                geo.start = get_start_sect(bd);
                ret = blkif_getgeo(bd, &geo);
                if (ret)
                        return ret;

                if (copy_to_user((struct hd_geometry __user *)argument, &geo,
                                 sizeof(geo)))
                        return -EFAULT;

                return 0;
        }
#endif
        case CDROMMULTISESSION:
                DPRINTK("FIXME: support multisession CDs later\n");
                for (i = 0; i < sizeof(struct cdrom_multisession); i++)
                        if (put_user(0, (char __user *)(argument + i)))
                                return -EFAULT;
                return 0;

        case CDROM_GET_CAPABILITY:
                if (info->gd && (info->gd->flags & GENHD_FL_CD))
                        return 0;
                return -EINVAL;

        default:
                if (info->mi && info->gd && info->rq) {
                        switch (info->mi->major) {
                        case SCSI_DISK0_MAJOR:
                        case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
                        case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
                        case SCSI_CDROM_MAJOR:
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
                                return scsi_cmd_ioctl(filep, info->gd, command,
                                                      (void __user *)argument);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
                                return scsi_cmd_ioctl(filep, info->rq,
                                                      info->gd, command,
                                                      (void __user *)argument);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0)
                                return scsi_cmd_ioctl(info->rq, info->gd,
                                                      mode, command,
                                                      (void __user *)argument);
#else
                                return scsi_cmd_blk_ioctl(bd, mode, command,
                                                          (void __user *)argument);
#endif
                        }
                }

                return -EINVAL; /* same return as native Linux */
        }

        return 0;
}


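/*
 * No real geometry exists for a paravirtual disk, so fabricate a classic
 * 255-head, 63-sectors-per-track layout (16065 sectors per cylinder) and
 * derive the cylinder count from the capacity, capping it at 0xffff.
 * E.g. a 16 GiB disk (33554432 sectors) reports 33554432/16065 = 2088
 * cylinders.
 */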
int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device */
        sector_t nsect = get_capacity(bd->bd_disk);
        sector_t cylinders = nsect;

        hg->heads = 0xff;
        hg->sectors = 0x3f;
        sector_div(cylinders, hg->heads * hg->sectors);
        hg->cylinders = cylinders;
        if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
                hg->cylinders = 0xffff;
        return 0;
}


/*
 * Generate a Xen blkfront IO request from a blk layer request.  Reads
 * and writes are handled as expected.
 *
 * @req: a request struct
 */
static int blkif_queue_request(struct request *req)
{
        struct blkfront_info *info = req->rq_disk->private_data;
        unsigned long buffer_mfn;
        blkif_request_t *ring_req;
        unsigned long id;
        unsigned int fsect, lsect;
        int i, ref;
        grant_ref_t gref_head;
        struct scatterlist *sg;

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
                return 1;

        if (gnttab_alloc_grant_references(
                BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
                gnttab_request_free_callback(
                        &info->callback,
                        blkif_restart_queue_callback,
                        info,
                        BLKIF_MAX_SEGMENTS_PER_REQUEST);
                return 1;
        }

        /* Fill out a communications ring structure. */
        ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
        id = GET_ID_FROM_FREELIST(info);
        info->shadow[id].request = req;

        ring_req->id = id;
        ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
        ring_req->handle = info->handle;

        ring_req->operation = rq_data_dir(req) ?
                BLKIF_OP_WRITE : BLKIF_OP_READ;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
        if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
#else
        if (req->cmd_flags & REQ_HARDBARRIER)
#endif
                ring_req->operation = info->flush_op;
        if (req->cmd_type == REQ_TYPE_BLOCK_PC)
                ring_req->operation = BLKIF_OP_PACKET;

        if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
                struct blkif_request_discard *discard = (void *)ring_req;

                /* id, sector_number and handle are set above. */
                discard->operation = BLKIF_OP_DISCARD;
                discard->flag = 0;
                discard->nr_sectors = blk_rq_sectors(req);
                if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
                        discard->flag = BLKIF_DISCARD_SECURE;
        } else {
                ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
                BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
                for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
                        buffer_mfn = page_to_phys(sg_page(sg)) >> PAGE_SHIFT;
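                        /*
                         * First and last 512-byte sectors covered by this
                         * segment within its page; the backend transfers
                         * only that window.
                         */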
                        fsect = sg->offset >> 9;
                        lsect = fsect + (sg->length >> 9) - 1;
                        /* install a grant reference. */
                        ref = gnttab_claim_grant_reference(&gref_head);
                        BUG_ON(ref == -ENOSPC);

                        gnttab_grant_foreign_access_ref(
                                ref,
                                info->xbdev->otherend_id,
                                buffer_mfn,
                                rq_data_dir(req) ? GTF_readonly : 0);

                        info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
                        ring_req->seg[i] =
                                (struct blkif_request_segment) {
                                        .gref       = ref,
                                        .first_sect = fsect,
                                        .last_sect  = lsect };
                }
        }

        info->ring.req_prod_pvt++;

        /* Keep a private copy so we can reissue requests when recovering. */
        info->shadow[id].req = *ring_req;

        gnttab_free_grant_references(gref_head);

        return 0;
}

/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
void do_blkif_request(struct request_queue *rq)
{
        struct blkfront_info *info = NULL;
        struct request *req;
        int queued;

        DPRINTK("Entered do_blkif_request\n");

        queued = 0;

        while ((req = blk_peek_request(rq)) != NULL) {
                info = req->rq_disk->private_data;

                if (RING_FULL(&info->ring))
                        goto wait;

                blk_start_request(req);

                if ((req->cmd_type != REQ_TYPE_FS &&
                     (req->cmd_type != REQ_TYPE_BLOCK_PC || req->cmd_len)) ||
                    ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
                     !info->flush_op)) {
                        req->errors = (DID_ERROR << 16) |
                                      (DRIVER_INVALID << 24);
                        __blk_end_request_all(req, -EIO);
                        continue;
                }

                DPRINTK("do_blk_req %p: cmd %p, sec %llx, "
                        "(%u/%u) buffer:%p [%s]\n",
                        req, req->cmd, (long long)blk_rq_pos(req),
                        blk_rq_cur_sectors(req), blk_rq_sectors(req),
                        req->buffer, rq_data_dir(req) ? "write" : "read");

                if (blkif_queue_request(req)) {
                        blk_requeue_request(rq, req);
                wait:
                        /* Avoid pointless unplugs. */
                        blk_stop_queue(rq);
                        break;
                }

                queued++;
        }

        if (queued != 0)
                flush_requests(info);
}


static irqreturn_t blkif_int(int irq, void *dev_id)
{
        struct request *req;
        blkif_response_t *bret;
        RING_IDX i, rp;
        unsigned long flags;
        struct blkfront_info *info = (struct blkfront_info *)dev_id;

        spin_lock_irqsave(&info->io_lock, flags);

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
                spin_unlock_irqrestore(&info->io_lock, flags);
                return IRQ_HANDLED;
        }

 again:
        rp = info->ring.sring->rsp_prod;
        rmb(); /* Ensure we see queued responses up to 'rp'. */

        for (i = info->ring.rsp_cons; i != rp; i++) {
                unsigned long id;
                int ret;

                bret = RING_GET_RESPONSE(&info->ring, i);
                id   = bret->id;
                req  = info->shadow[id].request;

                blkif_completion(&info->shadow[id]);

                ADD_ID_TO_FREELIST(info, id);

                ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
                switch (bret->operation) {
                        const char *what;

                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_WRITE_BARRIER:
                        what = bret->operation == BLKIF_OP_WRITE_BARRIER ?
                               "write barrier" : "flush disk cache";
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                pr_warn("blkfront: %s: %s op failed\n",
                                        info->gd->disk_name, what);
                                ret = -EOPNOTSUPP;
                        }
                        if (unlikely(bret->status == BLKIF_RSP_ERROR &&
                                     info->shadow[id].req.nr_segments == 0)) {
                                pr_warn("blkfront: %s: empty %s op failed\n",
                                        info->gd->disk_name, what);
                                ret = -EOPNOTSUPP;
                        }
                        if (unlikely(ret)) {
                                if (ret == -EOPNOTSUPP)
                                        ret = 0;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
                                info->feature_flush = 0;
#else
                                info->feature_flush = QUEUE_ORDERED_NONE;
#endif
                                xlvbd_flush(info);
                        }
                        /* fall through */
                case BLKIF_OP_READ:
                case BLKIF_OP_WRITE:
                case BLKIF_OP_PACKET:
                        if (unlikely(bret->status != BLKIF_RSP_OKAY))
                                DPRINTK("Bad return from blkdev data "
                                        "request: %x\n", bret->status);

                        __blk_end_request_all(req, ret);
                        break;
                case BLKIF_OP_DISCARD:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                struct request_queue *rq = info->rq;

                                pr_warn("blkfront: %s: discard op failed\n",
                                        info->gd->disk_name);
                                ret = -EOPNOTSUPP;
                                info->feature_discard = 0;
                                info->feature_secdiscard = 0;
                                queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
                                queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
                        }
                        __blk_end_request_all(req, ret);
                        break;
                default:
                        BUG();
                }
        }

        info->ring.rsp_cons = i;

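        /*
         * If requests are still outstanding, re-arm the event pointer and
         * re-check the ring: RING_FINAL_CHECK_FOR_RESPONSES closes the race
         * with a response arriving after the loop above but before the
         * backend would send another interrupt.  With nothing outstanding,
         * just ask to be notified at the next response.
         */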
        if (i != info->ring.req_prod_pvt) {
                int more_to_do;
                RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
                if (more_to_do)
                        goto again;
        } else
                info->ring.sring->rsp_event = i + 1;

        kick_pending_request_queues(info);

        spin_unlock_irqrestore(&info->io_lock, flags);

        return IRQ_HANDLED;
}

static void blkif_free(struct blkfront_info *info, int suspend)
{
        /* Prevent new requests being issued until we fix things up. */
        spin_lock_irq(&info->io_lock);
        info->connected = suspend ?
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        /* No more blkif_request(). */
        if (info->rq)
                blk_stop_queue(info->rq);
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irq(&info->io_lock);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_work_sync(&info->work);

        /* Free resources associated with old device channel. */
        if (info->ring_ref != GRANT_INVALID_REF) {
                gnttab_end_foreign_access(info->ring_ref,
                                          (unsigned long)info->ring.sring);
                info->ring_ref = GRANT_INVALID_REF;
                info->ring.sring = NULL;
        }
        if (info->irq)
                unbind_from_irqhandler(info->irq, info);
        info->irq = 0;
}

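/*
 * Revoke the grant references handed out for a completed request's data
 * segments.  Discard requests carry no segments, so there is nothing to
 * revoke for them.
 */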
static void blkif_completion(struct blk_shadow *s)
{
        int i;

        if (s->req.operation == BLKIF_OP_DISCARD)
                return;
        for (i = 0; i < s->req.nr_segments; i++)
                gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
}

static int blkif_recover(struct blkfront_info *info)
{
        int i;
        blkif_request_t *req;
        struct blk_shadow *copy;
        int j;

        /* Stage 1: Make a safe copy of the shadow state. */
        copy = kmemdup(info->shadow, sizeof(info->shadow),
                       GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH);
        if (!copy)
                return -ENOMEM;

        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Stage 3: Find pending requests and requeue them. */
        for (i = 0; i < BLK_RING_SIZE; i++) {
                /* Not in use? */
                if (!copy[i].request)
                        continue;

                /* Grab a request slot and copy shadow state into it. */
                req = RING_GET_REQUEST(
                        &info->ring, info->ring.req_prod_pvt);
                *req = copy[i].req;

                /* We get a new request id, and must reset the shadow state. */
                req->id = GET_ID_FROM_FREELIST(info);
                memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));

                /* Rewrite any grant references invalidated by susp/resume. */
                for (j = 0; j < req->nr_segments; j++)
                        gnttab_grant_foreign_access_ref(
                                req->seg[j].gref,
                                info->xbdev->otherend_id,
                                pfn_to_mfn(info->shadow[req->id].frame[j]),
                                rq_data_dir(info->shadow[req->id].request) ?
                                GTF_readonly : 0);
                info->shadow[req->id].req = *req;

                info->ring.req_prod_pvt++;
        }

        kfree(copy);

        (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);

        spin_lock_irq(&info->io_lock);

        /* Now safe for us to use the shared ring */
        info->connected = BLKIF_STATE_CONNECTED;

        /* Send off requeued requests */
        flush_requests(info);

        /* Kick any other new requests queued since we resumed */
        kick_pending_request_queues(info);

        spin_unlock_irq(&info->io_lock);

        return 0;
}

int blkfront_is_ready(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);

        return info->is_ready && info->xbdev;
}


/* ** Driver Registration ** */


static const struct xenbus_device_id blkfront_ids[] = {
        { "vbd" },
        { "" }
};
MODULE_ALIAS("xen:vbd");

static DEFINE_XENBUS_DRIVER(blkfront, ,
        .probe = blkfront_probe,
        .remove = blkfront_remove,
        .resume = blkfront_resume,
        .otherend_changed = backend_changed,
        .is_ready = blkfront_is_ready,
);


static int __init xlblk_init(void)
{
        if (!is_running_on_xen())
                return -ENODEV;

        return xenbus_register_frontend(&blkfront_driver);
}
module_init(xlblk_init);


static void __exit xlblk_exit(void)
{
        xenbus_unregister_driver(&blkfront_driver);
        xlbd_release_major_info();
}
module_exit(xlblk_exit);

MODULE_LICENSE("Dual BSD/GPL");