From: John Johansen Date: Wed, 15 Sep 2010 17:28:20 +0000 (-0700) Subject: UBUNTU: SAUCE: Improve Amazon EBS performance for EC2 X-Git-Url: http://git.alex.org.uk UBUNTU: SAUCE: Improve Amazon EBS performance for EC2 OriginalAuthor: Amazona from Ben Howard BugLink: http://bugs.launchpad.net/bugs/634316 The pv-ops kernel suffers from poor performance when using Amazon's Elastic block storage (EBS). This patch from Amazon improves pv-ops kernel performance, and has not exhibited any regressions. Signed-off-by: John Johansen Signed-off-by: Leann Ogasawara --- diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 61cc1ff..fcea517 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -72,7 +72,8 @@ struct blk_shadow { static DEFINE_MUTEX(blkfront_mutex); static const struct block_device_operations xlvbd_block_fops; -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +#define BLK_MAX_RING_AREA_SIZE (BLKIF_MAX_NUM_RING_PAGES * PAGE_SIZE) +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, BLK_MAX_RING_AREA_SIZE) /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -87,14 +88,15 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref; + int num_ring_pages; + int ring_ref[BLKIF_MAX_NUM_RING_PAGES]; struct blkif_front_ring ring; struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_RING_SIZE]; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; unsigned long shadow_free; unsigned int feature_flush; int is_ready; @@ -126,7 +128,8 @@ static DEFINE_SPINLOCK(minor_lock); static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE); + int ring_size = __RING_SIZE((struct blkif_sring *)0, info->num_ring_pages * PAGE_SIZE); + BUG_ON(free >= ring_size); info->shadow_free = info->shadow[free].req.id; info->shadow[free].req.id = 0x0fffffee; /* debug */ return free; @@ -584,6 +587,8 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free(struct blkfront_info *info, int suspend) { + int i; + /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&blkif_io_lock); info->connected = suspend ? @@ -599,10 +604,17 @@ static void blkif_free(struct blkfront_info *info, int suspend) flush_work_sync(&info->work); /* Free resources associated with old device channel. */ - if (info->ring_ref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref, 0, - (unsigned long)info->ring.sring); - info->ring_ref = GRANT_INVALID_REF; + for (i = 0; i < info->num_ring_pages; i++) { + /* Free resources associated with old device channel. */ + if (info->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref[i], 0, 0L); + info->ring_ref[i] = GRANT_INVALID_REF; + } + } + if (info->ring.sring) { + int ring_area_size = info->num_ring_pages * PAGE_SIZE; + free_pages((unsigned long)info->ring.sring, + get_order(ring_area_size)); info->ring.sring = NULL; } if (info->irq) @@ -705,27 +717,32 @@ static int setup_blkring(struct xenbus_device *dev, struct blkfront_info *info) { struct blkif_sring *sring; - int err; + int i, order, err; + int ring_area_size = info->num_ring_pages * PAGE_SIZE; - info->ring_ref = GRANT_INVALID_REF; + for (i = 0; i < info->num_ring_pages; i++) { + info->ring_ref[i] = GRANT_INVALID_REF; + } - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); + order = get_order(ring_area_size); + sring = (struct blkif_sring *)__get_free_pages(GFP_KERNEL, order); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); - - sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); - if (err < 0) { - free_page((unsigned long)sring); - info->ring.sring = NULL; - goto fail; + FRONT_RING_INIT(&info->ring, sring, ring_area_size); + + for (i = 0; i < info->num_ring_pages; i++) { + unsigned long addr = (unsigned long)info->ring.sring + i * PAGE_SIZE; + err = xenbus_grant_ring(dev, virt_to_mfn(addr)); + if (err < 0) { + free_pages((unsigned long)sring, order); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref[i] = err; } - info->ring_ref = err; err = xenbus_alloc_evtchn(dev, &info->evtchn); if (err) @@ -754,7 +771,13 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err; + int err, i; + + BUILD_BUG_ON(BLKIF_MAX_NUM_RING_PAGES != 1 && + BLKIF_MAX_NUM_RING_PAGES != 2 && + BLKIF_MAX_NUM_RING_PAGES != 4 && + BLKIF_MAX_NUM_RING_PAGES != 8 && + BLKIF_MAX_NUM_RING_PAGES != 16); /* Create shared ring, alloc event channel. */ err = setup_blkring(dev, info); @@ -768,11 +791,30 @@ again: goto destroy_blkring; } - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; + if (info->num_ring_pages == 1) { + err = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", info->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + err = xenbus_printf(xbt, dev->nodename, "num-ring-pages", "%u", + info->num_ring_pages); + if (err) { + message = "writing num-ring-pages"; + goto abort_transaction; + } + for (i = 0; i < info->num_ring_pages; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "ring-ref%d", i); + err = xenbus_printf(xbt, dev->nodename, buf, "%u", + info->ring_ref[i]); + if (err) { + message = "writing ring-refs"; + goto abort_transaction; + } + } } err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", info->evtchn); @@ -820,6 +862,7 @@ static int blkfront_probe(struct xenbus_device *dev, { int err, vdevice, i; struct blkfront_info *info; + int ring_size, max_ring_pages; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -833,6 +876,10 @@ static int blkfront_probe(struct xenbus_device *dev, return err; } } + err = xenbus_scanf(XBT_NIL, dev->otherend, + "max-ring-pages", "%u", &max_ring_pages); + if (err != 1) + max_ring_pages = 1; if (xen_hvm_domain()) { char *type; @@ -875,9 +922,13 @@ static int blkfront_probe(struct xenbus_device *dev, info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); - for (i = 0; i < BLK_RING_SIZE; i++) + info->num_ring_pages = min(max_ring_pages, BLKIF_MAX_NUM_RING_PAGES); + + ring_size = __RING_SIZE((struct blkif_sring *)0, + info->num_ring_pages * PAGE_SIZE); + for (i = 0; i < ring_size; i++) info->shadow[i].req.id = i+1; - info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + info->shadow[ring_size-1].req.id = 0x0fffffff; /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -890,6 +941,9 @@ static int blkfront_probe(struct xenbus_device *dev, return err; } + printk(KERN_INFO "blkfront %s num-ring-pages %d nr_ents %d.\n", + dev->nodename, info->num_ring_pages, ring_size); + return 0; } @@ -900,6 +954,7 @@ static int blkif_recover(struct blkfront_info *info) struct blkif_request *req; struct blk_shadow *copy; int j; + int ring_size = __RING_SIZE((struct blkif_sring *)0, info->num_ring_pages * PAGE_SIZE); /* Stage 1: Make a safe copy of the shadow state. */ copy = kmalloc(sizeof(info->shadow), @@ -910,13 +965,13 @@ static int blkif_recover(struct blkfront_info *info) /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE; i++) + for (i = 0; i < ring_size; i++) info->shadow[i].req.id = i+1; info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + info->shadow[ring_size-1].req.id = 0x0fffffff; /* Stage 3: Find pending requests and requeue them. */ - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < ring_size; i++) { /* Not in use? */ if (!copy[i].request) continue; @@ -1318,3 +1373,4 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR); MODULE_ALIAS("xen:vbd"); MODULE_ALIAS("xenblk"); + diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c2d1fa4..610cbed 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -87,6 +87,18 @@ struct blkif_response { DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); +/* + * Maximum number of pages used for a blkif ring + * max-ring-pages advertised by blkback to blkfront may be lowered at blkback + * mod load time. Load time param set to default. + */ +#define BLKIF_MAX_NUM_RING_PAGES 16 +#define BLKIF_MAX_NUM_RING_PAGES_DFLT 4 +#if BLKIF_MAX_NUM_RING_PAGES < BLKIF_MAX_NUM_RING_PAGES_DFLT +#undef BLKIF_MAX_NUM_RING_PAGES_DFLT +#define BLKIF_MAX_NUM_RING_PAGES_DFLT BLKIF_MAX_NUM_RING_PAGES +#endif + #define VDISK_CDROM 0x1 #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4