UBUNTU: SAUCE: Improve Amazon EBS performance for EC2
authorJohn Johansen <john.johansen@canonical.com>
Wed, 15 Sep 2010 17:28:20 +0000 (10:28 -0700)
committerLeann Ogasawara <leann.ogasawara@canonical.com>
Mon, 28 Mar 2011 13:49:18 +0000 (06:49 -0700)
OriginalAuthor: Amazon, from Ben Howard <behoward@amazon.com>
BugLink: http://bugs.launchpad.net/bugs/634316

The pv-ops kernel suffers from poor performance when using Amazon's
Elastic Block Store (EBS).  This patch from Amazon lets the blkfront
ring span multiple shared pages (up to BLKIF_MAX_NUM_RING_PAGES,
negotiated with blkback through the max-ring-pages xenstore key),
raising the number of requests that can be in flight at once.  It
improves pv-ops kernel performance and has not exhibited any
regressions.

Signed-off-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Leann Ogasawara <leann.ogasawara@canonical.com>
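
Not part of the patch, for context: a blkif ring holds a power-of-two
number of request slots, so growing the shared area from one page to
sixteen greatly increases the number of requests blkfront can keep in
flight against EBS.  The standalone sketch below reproduces the
rounding done by the Xen __RING_SIZE() macro; the 64-byte ring header
and 112-byte slot size are assumptions based on the classic blkif ABI,
not values taken from this patch.

    #include <stdio.h>

    #define PAGE_SZ   4096UL  /* assumed x86 page size */
    #define HDR_SZ      64UL  /* assumed blkif_sring header size */
    #define SLOT_SZ    112UL  /* assumed sizeof(union blkif_sring_entry) */

    /* Largest power-of-two number of slots fitting in num_pages of
     * ring area, mirroring the Xen __RING_SIZE() rounding. */
    static unsigned long ring_slots(unsigned long num_pages)
    {
            unsigned long raw = (num_pages * PAGE_SZ - HDR_SZ) / SLOT_SZ;
            unsigned long slots = 1;

            while (slots * 2 <= raw)
                    slots *= 2;
            return slots;
    }

    int main(void)
    {
            unsigned long pages;

            for (pages = 1; pages <= 16; pages *= 2)
                    printf("%2lu page(s): %4lu in-flight requests\n",
                           pages, ring_slots(pages));
            return 0;
    }

Under those assumptions one page yields 32 slots and sixteen pages
yield 512, a 16x increase in outstanding requests.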

drivers/block/xen-blkfront.c
include/xen/interface/io/blkif.h

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 61cc1ff..fcea517 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -72,7 +72,8 @@ struct blk_shadow {
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
 
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+#define BLK_MAX_RING_AREA_SIZE (BLKIF_MAX_NUM_RING_PAGES * PAGE_SIZE)
+#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, BLK_MAX_RING_AREA_SIZE)
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -87,14 +88,15 @@ struct blkfront_info
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
-       int ring_ref;
+       int num_ring_pages;
+       int ring_ref[BLKIF_MAX_NUM_RING_PAGES];
        struct blkif_front_ring ring;
        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int evtchn, irq;
        struct request_queue *rq;
        struct work_struct work;
        struct gnttab_free_callback callback;
-       struct blk_shadow shadow[BLK_RING_SIZE];
+       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
        unsigned long shadow_free;
        unsigned int feature_flush;
        int is_ready;
@@ -126,7 +128,8 @@ static DEFINE_SPINLOCK(minor_lock);
 static int get_id_from_freelist(struct blkfront_info *info)
 {
        unsigned long free = info->shadow_free;
-       BUG_ON(free >= BLK_RING_SIZE);
+       int ring_size = __RING_SIZE((struct blkif_sring *)0,
+                                   info->num_ring_pages * PAGE_SIZE);
+       BUG_ON(free >= ring_size);
        info->shadow_free = info->shadow[free].req.id;
        info->shadow[free].req.id = 0x0fffffee; /* debug */
        return free;
@@ -584,6 +587,8 @@ static void blkif_restart_queue(struct work_struct *work)
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
+       int i;
+
        /* Prevent new requests being issued until we fix things up. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = suspend ?
@@ -599,10 +604,17 @@ static void blkif_free(struct blkfront_info *info, int suspend)
        flush_work_sync(&info->work);
 
        /* Free resources associated with old device channel. */
-       if (info->ring_ref != GRANT_INVALID_REF) {
-               gnttab_end_foreign_access(info->ring_ref, 0,
-                                         (unsigned long)info->ring.sring);
-               info->ring_ref = GRANT_INVALID_REF;
+       for (i = 0; i < info->num_ring_pages; i++) {
+               if (info->ring_ref[i] != GRANT_INVALID_REF) {
+                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0L);
+                       info->ring_ref[i] = GRANT_INVALID_REF;
+               }
+       }
+       if (info->ring.sring) {
+               int ring_area_size = info->num_ring_pages * PAGE_SIZE;
+               free_pages((unsigned long)info->ring.sring,
+                          get_order(ring_area_size));
                info->ring.sring = NULL;
        }
        if (info->irq)
@@ -705,27 +717,32 @@ static int setup_blkring(struct xenbus_device *dev,
                         struct blkfront_info *info)
 {
        struct blkif_sring *sring;
-       int err;
+       int i, order, err;
+       int ring_area_size = info->num_ring_pages * PAGE_SIZE;
 
-       info->ring_ref = GRANT_INVALID_REF;
+       for (i = 0; i < info->num_ring_pages; i++) {
+               info->ring_ref[i] = GRANT_INVALID_REF;
+       }
 
-       sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+       order = get_order(ring_area_size);
+       sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+                                                      order);
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
-       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
-
-       sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
-       if (err < 0) {
-               free_page((unsigned long)sring);
-               info->ring.sring = NULL;
-               goto fail;
+       FRONT_RING_INIT(&info->ring, sring, ring_area_size);
+
+       sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+       for (i = 0; i < info->num_ring_pages; i++) {
+               unsigned long addr = (unsigned long)info->ring.sring + i * PAGE_SIZE;
+               err = xenbus_grant_ring(dev, virt_to_mfn(addr));
+               if (err < 0) {
+                       free_pages((unsigned long)sring, order);
+                       info->ring.sring = NULL;
+                       goto fail;
+               }
+               info->ring_ref[i] = err;
        }
-       info->ring_ref = err;
 
        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
@@ -754,7 +771,13 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
        const char *message = NULL;
        struct xenbus_transaction xbt;
-       int err;
+       int err, i;
+
+       BUILD_BUG_ON(BLKIF_MAX_NUM_RING_PAGES != 1 &&
+                    BLKIF_MAX_NUM_RING_PAGES != 2 &&
+                    BLKIF_MAX_NUM_RING_PAGES != 4 &&
+                    BLKIF_MAX_NUM_RING_PAGES != 8 &&
+                    BLKIF_MAX_NUM_RING_PAGES != 16);
 
        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
@@ -768,11 +791,30 @@ again:
                goto destroy_blkring;
        }
 
-       err = xenbus_printf(xbt, dev->nodename,
-                           "ring-ref", "%u", info->ring_ref);
-       if (err) {
-               message = "writing ring-ref";
-               goto abort_transaction;
+       if (info->num_ring_pages == 1) {
+               err = xenbus_printf(xbt, dev->nodename,
+                               "ring-ref", "%u", info->ring_ref[0]);
+               if (err) {
+                       message = "writing ring-ref";
+                       goto abort_transaction;
+               }
+       } else {
+               err = xenbus_printf(xbt, dev->nodename, "num-ring-pages", "%u",
+                               info->num_ring_pages);
+               if (err) {
+                       message = "writing num-ring-pages";
+                       goto abort_transaction;
+               }
+               for (i = 0; i < info->num_ring_pages; i++) {
+                       char buf[16];
+                       snprintf(buf, sizeof(buf), "ring-ref%d", i);
+                       err = xenbus_printf(xbt, dev->nodename, buf, "%u",
+                                       info->ring_ref[i]);
+                       if (err) {
+                               message = "writing ring-refs";
+                               goto abort_transaction;
+                       }
+               }
        }
        err = xenbus_printf(xbt, dev->nodename,
                            "event-channel", "%u", info->evtchn);
@@ -820,6 +862,7 @@ static int blkfront_probe(struct xenbus_device *dev,
 {
        int err, vdevice, i;
        struct blkfront_info *info;
+       int ring_size, max_ring_pages;
 
        /* FIXME: Use dynamic device id if this is not set. */
        err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -833,6 +876,10 @@ static int blkfront_probe(struct xenbus_device *dev,
                        return err;
                }
        }
+       err = xenbus_scanf(XBT_NIL, dev->otherend,
+                          "max-ring-pages", "%u", &max_ring_pages);
+       if (err != 1)
+               max_ring_pages = 1;
 
        if (xen_hvm_domain()) {
                char *type;
@@ -875,9 +922,13 @@ static int blkfront_probe(struct xenbus_device *dev,
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);
 
-       for (i = 0; i < BLK_RING_SIZE; i++)
+       info->num_ring_pages = min(max_ring_pages, BLKIF_MAX_NUM_RING_PAGES);
+
+       ring_size = __RING_SIZE((struct blkif_sring *)0,
+                               info->num_ring_pages * PAGE_SIZE);
+       for (i = 0; i < ring_size; i++)
                info->shadow[i].req.id = i+1;
-       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+       info->shadow[ring_size-1].req.id = 0x0fffffff;
 
        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -890,6 +941,9 @@ static int blkfront_probe(struct xenbus_device *dev,
                return err;
        }
 
+       printk(KERN_INFO "blkfront %s num-ring-pages %d nr_ents %d.\n",
+               dev->nodename, info->num_ring_pages, ring_size);
+
        return 0;
 }
 
@@ -900,6 +954,7 @@ static int blkif_recover(struct blkfront_info *info)
        struct blkif_request *req;
        struct blk_shadow *copy;
        int j;
+       int ring_size = __RING_SIZE((struct blkif_sring *)0,
+                                   info->num_ring_pages * PAGE_SIZE);
 
        /* Stage 1: Make a safe copy of the shadow state. */
        copy = kmalloc(sizeof(info->shadow),
@@ -910,13 +965,13 @@ static int blkif_recover(struct blkfront_info *info)
 
        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
-       for (i = 0; i < BLK_RING_SIZE; i++)
+       for (i = 0; i < ring_size; i++)
                info->shadow[i].req.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
-       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+       info->shadow[ring_size-1].req.id = 0x0fffffff;
 
        /* Stage 3: Find pending requests and requeue them. */
-       for (i = 0; i < BLK_RING_SIZE; i++) {
+       for (i = 0; i < ring_size; i++) {
                /* Not in use? */
                if (!copy[i].request)
                        continue;
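
Not part of the patch: the xenstore handshake above, seen from the
backend.  blkback is expected to advertise "max-ring-pages" and then
read back either the legacy "ring-ref" key or "num-ring-pages" plus
"ring-ref0".."ring-refN-1".  The matching blkback change is not
included here, so the sketch below is hypothetical; it reuses the
xenbus_scanf() calls the frontend code already demonstrates, and the
function name is invented for illustration.

    #include <linux/errno.h>
    #include <xen/xenbus.h>
    #include <xen/interface/io/blkif.h>

    /* Hypothetical blkback-side reader for the keys written by
     * talk_to_blkback() above.  Error handling trimmed. */
    static int blkback_read_ring_refs(struct xenbus_device *dev,
                                      unsigned int *num_ring_pages,
                                      grant_ref_t *ring_ref)
    {
            unsigned int i;

            if (xenbus_scanf(XBT_NIL, dev->otherend, "num-ring-pages",
                             "%u", num_ring_pages) != 1)
                    *num_ring_pages = 1;  /* legacy single-page frontend */

            for (i = 0; i < *num_ring_pages; i++) {
                    char buf[16];

                    /* A single-page frontend writes the bare "ring-ref"
                     * key; multi-page frontends write "ring-ref0".. */
                    if (*num_ring_pages == 1)
                            snprintf(buf, sizeof(buf), "ring-ref");
                    else
                            snprintf(buf, sizeof(buf), "ring-ref%u", i);
                    if (xenbus_scanf(XBT_NIL, dev->otherend, buf,
                                     "%u", &ring_ref[i]) != 1)
                            return -EINVAL;
            }
            return 0;
    }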
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c2d1fa4..610cbed 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -87,6 +87,18 @@ struct blkif_response {
 
 DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
 
+/*
+ * Maximum number of pages used for a blkif ring.
+ * The max-ring-pages value advertised by blkback to blkfront may be
+ * lowered at blkback module load time; the load-time parameter
+ * defaults to BLKIF_MAX_NUM_RING_PAGES_DFLT.
+ */
+#define BLKIF_MAX_NUM_RING_PAGES 16
+#define BLKIF_MAX_NUM_RING_PAGES_DFLT 4
+#if BLKIF_MAX_NUM_RING_PAGES < BLKIF_MAX_NUM_RING_PAGES_DFLT
+#undef BLKIF_MAX_NUM_RING_PAGES_DFLT
+#define BLKIF_MAX_NUM_RING_PAGES_DFLT BLKIF_MAX_NUM_RING_PAGES
+#endif
+
 #define VDISK_CDROM        0x1
 #define VDISK_REMOVABLE    0x2
 #define VDISK_READONLY     0x4
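
Not part of the patch: the comment on BLKIF_MAX_NUM_RING_PAGES_DFLT
above anticipates a blkback load-time parameter.  A hypothetical sketch
of what that could look like on the backend (names invented for
illustration):

    /* Default advertised ring size; an administrator may lower it,
     * e.g. with blkback.max_ring_pages=2 on the module command line. */
    static unsigned int max_ring_pages = BLKIF_MAX_NUM_RING_PAGES_DFLT;
    module_param(max_ring_pages, uint, 0444);
    MODULE_PARM_DESC(max_ring_pages,
                     "Max pages to advertise for a blkif ring");

    /* At device init, blkback would then advertise the clamped value:
     *   xenbus_printf(XBT_NIL, dev->nodename, "max-ring-pages",
     *                 "%u", max_ring_pages);
     */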