authorJustin T. Gibbs <gibbs@FreeBSD.org>2012-03-01 19:09:28 +0000
committerJustin T. Gibbs <gibbs@FreeBSD.org>2012-03-01 19:09:28 +0000
commit2f121f392b1aad76e7662cb626ccd7cc4e605570 (patch)
tree4073a0f695e482e914bac1992acea5e98f4b401b
parent3d6e82d782e78e9def5b91a8bcf2b6ece79e4c7e (diff)
downloadsrc-2f121f392b1aad76e7662cb626ccd7cc4e605570.tar.gz
src-2f121f392b1aad76e7662cb626ccd7cc4e605570.zip
MFC r231743,231836-231837,231839,231883,232308

Xen PV block interface enhancements

Approved by:    re (kib)
Reviewed by:    cperciva
Tested by:      cperciva
Sponsored by:   Spectra Logic Corporation

r231743
=======
Enhance documentation, improve interoperability, and fix defects in
FreeBSD's front and back Xen blkif interface drivers.

sys/dev/xen/blkfront/block.h:
sys/dev/xen/blkfront/blkfront.c:
sys/dev/xen/blkback/blkback.c:
    Replace the FreeBSD-specific multi-page ring implementation with
    support for both the Citrix and Amazon/Red Hat versions of this
    extension.

sys/dev/xen/blkfront/blkfront.c:
    o Add a per-instance sysctl tree that exposes all negotiated
      transport parameters (ring pages, max number of requests,
      max request size, max number of segments).
    o In blkfront_vdevice_to_unit() add a missing return statement so
      that we properly identify the unit number for high-numbered xvd
      devices.

sys/dev/xen/blkback/blkback.c:
    o Add static DTrace probes for several events in this driver.
    o Defer connection shutdown processing until the front-end enters
      the closed state.  This avoids prematurely tearing down the
      connection when buggy front-ends transition to the closing
      state, even though the device is open and they veto the close
      request from the tool stack.
    o Add nodes for maximum request size and the number of active ring
      pages to the existing, per-instance, sysctl tree.
    o Miscellaneous style cleanup.

sys/xen/interface/io/blkif.h:
    o Add extensive documentation of the XenStore nodes used to
      implement the blkif interface.
    o Document the startup sequence between a front and back driver.
    o Add structures and documentation for the "discard" feature
      (AKA Trim).
    o Clean up some definitions related to FreeBSD's request
      number/size/segment-limit extension.

sys/dev/xen/blkfront/blkfront.c:
sys/dev/xen/blkback/blkback.c:
sys/xen/xenbus/xenbusvar.h:
    Add the convenience function xenbus_get_otherend_state() and use
    it to simplify some logic in both block-front and block-back.

r231836
=======
Fix "_" vs. "-" typo in a comment.  No functional changes.

r231837
=======
Fix typo in a printf string: "specificed" -> "specified".

r231839
=======
Fix a bug in the calculation of the maximum I/O request size.  The
previous code did not limit the I/O request size based on the maximum
number of segments supported by the back-end.  In current practice,
since the only back-end supporting chained requests is the FreeBSD
implementation, this limit was never exceeded.

sys/dev/xen/blkfront/block.h:
    Add two macros, XBF_SEGS_TO_SIZE() and XBF_SIZE_TO_SEGS(), to
    centralize the logic of reserving a segment to deal with
    non-page-aligned I/Os.

sys/dev/xen/blkfront/blkfront.c:
    o When negotiating transfer parameters, limit the max_request_size
      we use and publish, if it is greater than the maximum,
      unaligned, I/O we can support with the number of segments
      advertised by the backend.
    o Don't unilaterally reduce the I/O size published to the disk
      layer by a single page.  max_request_size is already properly
      limited in the transfer parameter negotiation code.
    o Fix typos in printf strings:
        "max_requests_segments" -> "max_request_segments"
        "specificed" -> "specified"

r231883
=======
Fix regression in the handling of blkback close events for devices
that are unplugged via QEMU.

sys/dev/xen/blkback/blkback.c:
    Toolstack-initiated closures change the frontend's state to
    Closing.  The backend must change to Closing as well, even if we
    can't actually close yet, in order for the frontend to notice and
    start the closing process.

r232308
=======
blkif interface comment cleanups.  No functional changes.

sys/xen/interface/io/blkif.h:
    o Insert space in "Red Hat".
    o Fix typo "discard-aligment" -> "discard-alignment"
    o Fix typo "unamp" -> "unmap"
    o Fix typo "formated" -> "formatted"
    o Clarify the text for "params".
    o Clarify the text for "sector-size".
    o Clarify the text for "max-requests" in the backend section.
Notes:
    svn path=/stable/8/; revision=232352
-rw-r--r--  sys/dev/xen/blkback/blkback.c   | 207
-rw-r--r--  sys/dev/xen/blkfront/blkfront.c | 165
-rw-r--r--  sys/dev/xen/blkfront/block.h    |  40
-rw-r--r--  sys/xen/interface/io/blkif.h    | 500
-rw-r--r--  sys/xen/xenbus/xenbusvar.h      |  14
5 files changed, 787 insertions(+), 139 deletions(-)
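The heart of this MFC is the transport-parameter negotiation between
blkfront and blkback.  The following is a minimal sketch of the frontend
side, not code taken verbatim from the patch: it condenses the
blkfront_initialize() logic shown in the blkfront.c hunks below and
assumes the FreeBSD xs_scanf(), fls(), and powerof2() interfaces.
Citrix-style backends publish "max-ring-page-order", while Red Hat/
Amazon-style backends publish "max-ring-pages"; xs_scanf() leaves its
output variable untouched when a node is absent, so the defaults survive
a sparsely populated backend tree.

    static uint32_t
    negotiate_ring_pages(const char *otherend_path)
    {
            uint32_t max_ring_page_order = 0;
            uint32_t ring_pages = 1;

            /* Citrix scheme: the backend publishes lb(pages). */
            (void)xs_scanf(XST_NIL, otherend_path,
                           "max-ring-page-order", NULL, "%" PRIu32,
                           &max_ring_page_order);
            ring_pages = 1u << max_ring_page_order;

            /*
             * Amazon/Red Hat scheme: the backend publishes a page
             * count.  ring_pages is untouched if the node is absent.
             */
            (void)xs_scanf(XST_NIL, otherend_path,
                           "max-ring-pages", NULL, "%" PRIu32,
                           &ring_pages);

            if (ring_pages < 1)
                    ring_pages = 1;
            if (ring_pages > XBF_MAX_RING_PAGES)
                    ring_pages = XBF_MAX_RING_PAGES;
            if (!powerof2(ring_pages))
                    /* Round down to the nearest power of 2. */
                    ring_pages = 1u << (fls(ring_pages) - 1);
            return (ring_pages);
    }
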
diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c
index 53d272ed52db..859db95edd28 100644
--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -40,6 +40,8 @@ __FBSDID("$FreeBSD$");
* a FreeBSD domain to other domains.
*/
+#include "opt_kdtrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -63,6 +65,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/bitstring.h>
+#include <sys/sdt.h>
#include <geom/geom.h>
@@ -124,7 +127,7 @@ __FBSDID("$FreeBSD$");
MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
#ifdef XBB_DEBUG
-#define DPRINTF(fmt, args...) \
+#define DPRINTF(fmt, args...) \
printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
@@ -134,7 +137,7 @@ MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
* The maximum mapped region size per request we will allow in a negotiated
* block-front/back communication channel.
*/
-#define XBB_MAX_REQUEST_SIZE \
+#define XBB_MAX_REQUEST_SIZE \
MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
/**
@@ -142,9 +145,9 @@ MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
* segment blocks) per request we will allow in a negotiated block-front/back
* communication channel.
*/
-#define XBB_MAX_SEGMENTS_PER_REQUEST \
- (MIN(UIO_MAXIOV, \
- MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
+#define XBB_MAX_SEGMENTS_PER_REQUEST \
+ (MIN(UIO_MAXIOV, \
+ MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
(XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
/**
@@ -980,9 +983,10 @@ xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
- intptr_t first_clear, num_clear;
+ intptr_t first_clear;
+ intptr_t num_clear;
uint8_t *free_kva;
- int i;
+ int i;
KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
@@ -1681,19 +1685,19 @@ xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
req_ring_idx++;
switch (xbb->abi) {
case BLKIF_PROTOCOL_NATIVE:
- sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
- req_ring_idx);
+ sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
+ req_ring_idx);
break;
case BLKIF_PROTOCOL_X86_32:
{
- sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
- req_ring_idx);
+ sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
+ req_ring_idx);
break;
}
case BLKIF_PROTOCOL_X86_64:
{
- sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
- req_ring_idx);
+ sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
+ req_ring_idx);
break;
}
default:
@@ -1817,8 +1821,8 @@ xbb_run_queue(void *context, int pending)
struct xbb_xen_reqlist *reqlist;
- xbb = (struct xbb_softc *)context;
- rings = &xbb->rings;
+ xbb = (struct xbb_softc *)context;
+ rings = &xbb->rings;
/*
* Work gather and dispatch loop. Note that we have a bias here
@@ -2032,6 +2036,13 @@ xbb_intr(void *arg)
taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}
+SDT_PROVIDER_DEFINE(xbb);
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
+ "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
+ "uint64_t", "uint64_t");
+
/*----------------------------- Backend Handlers -----------------------------*/
/**
* Backend handler for character device access.
@@ -2087,6 +2098,9 @@ xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
nreq->pendcnt = 1;
+ SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
+ device_get_unit(xbb->dev));
+
(*dev_data->csw->d_strategy)(bio);
return (0);
@@ -2181,6 +2195,17 @@ xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
bios[bio_idx]->bio_bcount);
}
#endif
+ if (operation == BIO_READ) {
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
+ device_get_unit(xbb->dev),
+ bios[bio_idx]->bio_offset,
+ bios[bio_idx]->bio_length);
+ } else if (operation == BIO_WRITE) {
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
+ device_get_unit(xbb->dev),
+ bios[bio_idx]->bio_offset,
+ bios[bio_idx]->bio_length);
+ }
(*dev_data->csw->d_strategy)(bios[bio_idx]);
}
@@ -2193,6 +2218,12 @@ fail_free_bios:
return (error);
}
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
+ "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
+ "uint64_t", "uint64_t");
+
/**
* Backend handler for file access.
*
@@ -2237,6 +2268,9 @@ xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
case BIO_FLUSH: {
struct mount *mountpoint;
+ SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
+ device_get_unit(xbb->dev));
+
vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
@@ -2336,6 +2370,10 @@ xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
switch (operation) {
case BIO_READ:
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
+ device_get_unit(xbb->dev), xuio.uio_offset,
+ xuio.uio_resid);
+
vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
/*
@@ -2366,6 +2404,10 @@ xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
case BIO_WRITE: {
struct mount *mountpoint;
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
+ device_get_unit(xbb->dev), xuio.uio_offset,
+ xuio.uio_resid);
+
(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
@@ -3028,6 +3070,8 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
const char *otherend_path;
int error;
u_int ring_idx;
+ u_int ring_page_order;
+ size_t ring_size;
otherend_path = xenbus_get_otherend_path(xbb->dev);
@@ -3035,23 +3079,19 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
* Protocol defaults valid even if all negotiation fails.
*/
xbb->ring_config.ring_pages = 1;
- xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
/*
* Mandatory data (used in all versions of the protocol) first.
*/
- error = xs_gather(XST_NIL, otherend_path,
- "ring-ref", "%" PRIu32,
- &xbb->ring_config.ring_ref[0],
- "event-channel", "%" PRIu32,
- &xbb->ring_config.evtchn,
- NULL);
+ error = xs_scanf(XST_NIL, otherend_path,
+ "event-channel", NULL, "%" PRIu32,
+ &xbb->ring_config.evtchn);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
- "Unable to retrieve ring information from "
- "frontend %s. Unable to connect.",
+ "Unable to retrieve event-channel information "
+ "from frontend %s. Unable to connect.",
xenbus_get_otherend_path(xbb->dev));
return (error);
}
@@ -3065,10 +3105,20 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
* we must use independant calls in order to guarantee
* we don't miss information in a sparsly populated front-end
* tree.
+ *
+ * \note xs_scanf() does not update variables for unmatched
+ * fields.
*/
+ ring_page_order = 0;
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "ring-page-order", NULL, "%u",
+ &ring_page_order);
+ xbb->ring_config.ring_pages = 1 << ring_page_order;
(void)xs_scanf(XST_NIL, otherend_path,
- "ring-pages", NULL, "%u",
+ "num-ring-pages", NULL, "%u",
&xbb->ring_config.ring_pages);
+ ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
+ xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
(void)xs_scanf(XST_NIL, otherend_path,
"max-requests", NULL, "%u",
@@ -3084,7 +3134,7 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
xenbus_dev_fatal(xbb->dev, EINVAL,
- "Front-end specificed ring-pages of %u "
+ "Front-end specified ring-pages of %u "
"exceeds backend limit of %zu. "
"Unable to connect.",
xbb->ring_config.ring_pages,
@@ -3092,7 +3142,7 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
return (EINVAL);
} else if (xbb->max_requests > XBB_MAX_REQUESTS) {
xenbus_dev_fatal(xbb->dev, EINVAL,
- "Front-end specificed max_requests of %u "
+ "Front-end specified max_requests of %u "
"exceeds backend limit of %u. "
"Unable to connect.",
xbb->max_requests,
@@ -3100,7 +3150,7 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
return (EINVAL);
} else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
xenbus_dev_fatal(xbb->dev, EINVAL,
- "Front-end specificed max_requests_segments "
+ "Front-end specified max_requests_segments "
"of %u exceeds backend limit of %u. "
"Unable to connect.",
xbb->max_request_segments,
@@ -3108,7 +3158,7 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
return (EINVAL);
} else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
xenbus_dev_fatal(xbb->dev, EINVAL,
- "Front-end specificed max_request_size "
+ "Front-end specified max_request_size "
"of %u exceeds backend limit of %u. "
"Unable to connect.",
xbb->max_request_size,
@@ -3116,22 +3166,39 @@ xbb_collect_frontend_info(struct xbb_softc *xbb)
return (EINVAL);
}
- /* If using a multi-page ring, pull in the remaining references. */
- for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
- char ring_ref_name[]= "ring_refXX";
-
- snprintf(ring_ref_name, sizeof(ring_ref_name),
- "ring-ref%u", ring_idx);
- error = xs_scanf(XST_NIL, otherend_path,
- ring_ref_name, NULL, "%" PRIu32,
- &xbb->ring_config.ring_ref[ring_idx]);
+ if (xbb->ring_config.ring_pages == 1) {
+ error = xs_gather(XST_NIL, otherend_path,
+ "ring-ref", "%" PRIu32,
+ &xbb->ring_config.ring_ref[0],
+ NULL);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
- "Failed to retriev grant reference "
- "for page %u of shared ring. Unable "
- "to connect.", ring_idx);
+ "Unable to retrieve ring information "
+ "from frontend %s. Unable to "
+ "connect.",
+ xenbus_get_otherend_path(xbb->dev));
return (error);
}
+ } else {
+ /* Multi-page ring format. */
+ for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++) {
+ char ring_ref_name[]= "ring_refXX";
+
+ snprintf(ring_ref_name, sizeof(ring_ref_name),
+ "ring-ref%u", ring_idx);
+ error = xs_scanf(XST_NIL, otherend_path,
+ ring_ref_name, NULL, "%" PRIu32,
+ &xbb->ring_config.ring_ref[ring_idx]);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Failed to retriev grant "
+ "reference for page %u of "
+ "shared ring. Unable "
+ "to connect.", ring_idx);
+ return (error);
+ }
+ }
}
error = xs_gather(XST_NIL, otherend_path,
@@ -3197,8 +3264,8 @@ xbb_alloc_requests(struct xbb_softc *xbb)
static int
xbb_alloc_request_lists(struct xbb_softc *xbb)
{
- int i;
struct xbb_xen_reqlist *reqlist;
+ int i;
/*
* If no requests can be merged, we need 1 request list per
@@ -3318,7 +3385,7 @@ xbb_publish_backend_info(struct xbb_softc *xbb)
static void
xbb_connect(struct xbb_softc *xbb)
{
- int error;
+ int error;
if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
return;
@@ -3399,7 +3466,8 @@ xbb_connect(struct xbb_softc *xbb)
static int
xbb_shutdown(struct xbb_softc *xbb)
{
- int error;
+ XenbusState frontState;
+ int error;
DPRINTF("\n");
@@ -3413,6 +3481,20 @@ xbb_shutdown(struct xbb_softc *xbb)
if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
return (EAGAIN);
+ xbb->flags |= XBBF_IN_SHUTDOWN;
+ mtx_unlock(&xbb->lock);
+
+ if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
+ xenbus_set_state(xbb->dev, XenbusStateClosing);
+
+ frontState = xenbus_get_otherend_state(xbb->dev);
+ mtx_lock(&xbb->lock);
+ xbb->flags &= ~XBBF_IN_SHUTDOWN;
+
+ /* The front can submit I/O until entering the closed state. */
+ if (frontState < XenbusStateClosed)
+ return (EAGAIN);
+
DPRINTF("\n");
/* Indicate shutdown is in progress. */
@@ -3434,19 +3516,6 @@ xbb_shutdown(struct xbb_softc *xbb)
DPRINTF("\n");
- /*
- * Before unlocking mutex, set this flag to prevent other threads from
- * getting into this function
- */
- xbb->flags |= XBBF_IN_SHUTDOWN;
- mtx_unlock(&xbb->lock);
-
- if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
- xenbus_set_state(xbb->dev, XenbusStateClosing);
-
- mtx_lock(&xbb->lock);
- xbb->flags &= ~XBBF_IN_SHUTDOWN;
-
/* Indicate to xbb_detach() that is it safe to proceed. */
wakeup(xbb);
@@ -3573,6 +3642,16 @@ xbb_setup_sysctl(struct xbb_softc *xbb)
"max_request_segments", CTLFLAG_RD,
&xbb->max_request_segments, 0,
"maximum number of pages per requests (negotiated)");
+
+ SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+ "max_request_size", CTLFLAG_RD,
+ &xbb->max_request_size, 0,
+ "maximum size in bytes of a request (negotiated)");
+
+ SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+ "ring_pages", CTLFLAG_RD,
+ &xbb->ring_config.ring_pages, 0,
+ "communication channel pages (negotiated)");
}
/**
@@ -3587,6 +3666,7 @@ xbb_attach(device_t dev)
{
struct xbb_softc *xbb;
int error;
+ u_int max_ring_page_order;
DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
@@ -3621,6 +3701,10 @@ xbb_attach(device_t dev)
return (error);
}
+ /*
+ * Amazon EC2 client compatility. They refer to max-ring-pages
+ * instead of to max-ring-page-order.
+ */
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
if (error) {
@@ -3629,6 +3713,15 @@ xbb_attach(device_t dev)
return (error);
}
+ max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-ring-page-order", "%u", max_ring_page_order);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"max-requests", "%u", XBB_MAX_REQUESTS);
if (error) {
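
Plugging in typical amd64 values makes the backend limits above
concrete.  This is a hedged worked example: MAXPHYS = 128 KiB,
PAGE_SIZE = 4 KiB, and UIO_MAXIOV = 1024 are assumed platform defaults,
not values from this patch; BLKIF_MAX_SEGMENTS_PER_REQUEST = 255 does
appear in the blkif.h hunks below.

    /*
     * XBB_MAX_REQUEST_SIZE         = MIN(131072, 255 * 4096)
     *                              = 131072 (128 KiB)
     * XBB_MAX_SEGMENTS_PER_REQUEST = MIN(1024, MIN(255, 131072/4096 + 1))
     *                              = 33
     *
     * That is, 32 fully utilized pages plus one segment reserved so a
     * misaligned 128 KiB transfer can be mapped without a bounce buffer.
     */
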
diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c
index fb530f3f13f3..b2f8909ab3bb 100644
--- a/sys/dev/xen/blkfront/blkfront.c
+++ b/sys/dev/xen/blkfront/blkfront.c
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/module.h>
+#include <sys/sysctl.h>
#include <machine/bus.h>
#include <sys/rman.h>
@@ -139,7 +140,7 @@ static int xb_dump(void *, void *, vm_offset_t, off_t, size_t);
* with blkfront as the emulated drives, easing transition slightly.
*/
static void
-blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
+blkfront_vdevice_to_unit(uint32_t vdevice, int *unit, const char **name)
{
static struct vdev_info {
int major;
@@ -186,6 +187,7 @@ blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
if (vdevice & (1 << 28)) {
*unit = (vdevice & ((1 << 28) - 1)) >> 8;
*name = "xbd";
+ return;
}
for (i = 0; info[i].major; i++) {
@@ -407,6 +409,40 @@ blkfront_probe(device_t dev)
return (ENXIO);
}
+static void
+xb_setup_sysctl(struct xb_softc *xb)
+{
+ struct sysctl_ctx_list *sysctl_ctx = NULL;
+ struct sysctl_oid *sysctl_tree = NULL;
+
+ sysctl_ctx = device_get_sysctl_ctx(xb->xb_dev);
+ if (sysctl_ctx == NULL)
+ return;
+
+ sysctl_tree = device_get_sysctl_tree(xb->xb_dev);
+ if (sysctl_tree == NULL)
+ return;
+
+ SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+ "max_requests", CTLFLAG_RD, &xb->max_requests, -1,
+ "maximum outstanding requests (negotiated)");
+
+ SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+ "max_request_segments", CTLFLAG_RD,
+ &xb->max_request_segments, 0,
+ "maximum number of pages per requests (negotiated)");
+
+ SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+ "max_request_size", CTLFLAG_RD,
+ &xb->max_request_size, 0,
+ "maximum size in bytes of a request (negotiated)");
+
+ SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+ "ring_pages", CTLFLAG_RD,
+ &xb->ring_pages, 0,
+ "communication channel pages (negotiated)");
+}
+
/*
* Setup supplies the backend dir, virtual device. We place an event
* channel and shared frame entries. We watch backend to wait if it's
@@ -417,14 +453,14 @@ blkfront_attach(device_t dev)
{
struct xb_softc *sc;
const char *name;
+ uint32_t vdevice;
int error;
- int vdevice;
int i;
int unit;
/* FIXME: Use dynamic device id if this is not set. */
error = xs_scanf(XST_NIL, xenbus_get_node(dev),
- "virtual-device", NULL, "%i", &vdevice);
+ "virtual-device", NULL, "%" PRIu32, &vdevice);
if (error) {
xenbus_dev_fatal(dev, error, "reading virtual-device");
device_printf(dev, "Couldn't determine virtual device.\n");
@@ -449,6 +485,8 @@ blkfront_attach(device_t dev)
sc->vdevice = vdevice;
sc->connected = BLKIF_STATE_DISCONNECTED;
+ xb_setup_sysctl(sc);
+
/* Wait for backend device to publish its protocol capabilities. */
xenbus_set_state(dev, XenbusStateInitialising);
@@ -501,6 +539,7 @@ blkfront_initialize(struct xb_softc *sc)
{
const char *otherend_path;
const char *node_path;
+ uint32_t max_ring_page_order;
int error;
int i;
@@ -513,10 +552,10 @@ blkfront_initialize(struct xb_softc *sc)
* Protocol defaults valid even if negotiation for a
* setting fails.
*/
+ max_ring_page_order = 0;
sc->ring_pages = 1;
- sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
- sc->max_request_size = (sc->max_request_segments - 1) * PAGE_SIZE;
+ sc->max_request_size = XBF_SEGS_TO_SIZE(sc->max_request_segments);
sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
/*
@@ -526,13 +565,25 @@ blkfront_initialize(struct xb_softc *sc)
* we must use independant calls in order to guarantee
* we don't miss information in a sparsly populated back-end
* tree.
+ *
+ * \note xs_scanf() does not update variables for unmatched
+ * fields.
*/
otherend_path = xenbus_get_otherend_path(sc->xb_dev);
node_path = xenbus_get_node(sc->xb_dev);
+
+ /* Support both backend schemes for relaying ring page limits. */
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-ring-page-order", NULL, "%" PRIu32,
+ &max_ring_page_order);
+ sc->ring_pages = 1 << max_ring_page_order;
(void)xs_scanf(XST_NIL, otherend_path,
"max-ring-pages", NULL, "%" PRIu32,
&sc->ring_pages);
+ if (sc->ring_pages < 1)
+ sc->ring_pages = 1;
+ sc->max_requests = BLKIF_MAX_RING_REQUESTS(sc->ring_pages * PAGE_SIZE);
(void)xs_scanf(XST_NIL, otherend_path,
"max-requests", NULL, "%" PRIu32,
&sc->max_requests);
@@ -552,6 +603,16 @@ blkfront_initialize(struct xb_softc *sc)
sc->ring_pages = XBF_MAX_RING_PAGES;
}
+ if (powerof2(sc->ring_pages) == 0) {
+ uint32_t new_page_limit;
+
+ new_page_limit = 0x01 << (fls(sc->ring_pages) - 1);
+ device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+ "%u is not a power of 2. Limited to %u.\n",
+ sc->ring_pages, new_page_limit);
+ sc->ring_pages = new_page_limit;
+ }
+
if (sc->max_requests > XBF_MAX_REQUESTS) {
device_printf(sc->xb_dev, "Back-end specified max_requests of "
"%u limited to front-end limit of %u.\n",
@@ -560,8 +621,8 @@ blkfront_initialize(struct xb_softc *sc)
}
if (sc->max_request_segments > XBF_MAX_SEGMENTS_PER_REQUEST) {
- device_printf(sc->xb_dev, "Back-end specificed "
- "max_requests_segments of %u limited to "
+ device_printf(sc->xb_dev, "Back-end specified "
+ "max_request_segments of %u limited to "
"front-end limit of %u.\n",
sc->max_request_segments,
XBF_MAX_SEGMENTS_PER_REQUEST);
@@ -569,12 +630,23 @@ blkfront_initialize(struct xb_softc *sc)
}
if (sc->max_request_size > XBF_MAX_REQUEST_SIZE) {
- device_printf(sc->xb_dev, "Back-end specificed "
+ device_printf(sc->xb_dev, "Back-end specified "
"max_request_size of %u limited to front-end "
"limit of %u.\n", sc->max_request_size,
XBF_MAX_REQUEST_SIZE);
sc->max_request_size = XBF_MAX_REQUEST_SIZE;
}
+
+ if (sc->max_request_size > XBF_SEGS_TO_SIZE(sc->max_request_segments)) {
+ device_printf(sc->xb_dev, "Back-end specified "
+ "max_request_size of %u limited to front-end "
+ "limit of %u. (Too few segments.)\n",
+ sc->max_request_size,
+ XBF_SEGS_TO_SIZE(sc->max_request_segments));
+ sc->max_request_size =
+ XBF_SEGS_TO_SIZE(sc->max_request_segments);
+ }
+
sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
/* Allocate datastructures based on negotiated values. */
@@ -625,11 +697,20 @@ blkfront_initialize(struct xb_softc *sc)
if (setup_blkring(sc) != 0)
return;
+ /* Support both backend schemes for relaying ring page limits. */
error = xs_printf(XST_NIL, node_path,
- "ring-pages","%u", sc->ring_pages);
+ "num-ring-pages","%u", sc->ring_pages);
if (error) {
xenbus_dev_fatal(sc->xb_dev, error,
- "writing %s/ring-pages",
+ "writing %s/num-ring-pages",
+ node_path);
+ return;
+ }
+ error = xs_printf(XST_NIL, node_path,
+ "ring-page-order","%u", fls(sc->ring_pages) - 1);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/ring-page-order",
node_path);
return;
}
@@ -711,25 +792,31 @@ setup_blkring(struct xb_softc *sc)
return (error);
}
}
- error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
- "ring-ref","%u", sc->ring_ref[0]);
- if (error) {
- xenbus_dev_fatal(sc->xb_dev, error, "writing %s/ring-ref",
- xenbus_get_node(sc->xb_dev));
- return (error);
- }
- for (i = 1; i < sc->ring_pages; i++) {
- char ring_ref_name[]= "ring_refXX";
-
- snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
+ if (sc->ring_pages == 1) {
error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
- ring_ref_name, "%u", sc->ring_ref[i]);
+ "ring-ref", "%u", sc->ring_ref[0]);
if (error) {
- xenbus_dev_fatal(sc->xb_dev, error, "writing %s/%s",
- xenbus_get_node(sc->xb_dev),
- ring_ref_name);
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/ring-ref",
+ xenbus_get_node(sc->xb_dev));
return (error);
}
+ } else {
+ for (i = 0; i < sc->ring_pages; i++) {
+ char ring_ref_name[]= "ring_refXX";
+
+ snprintf(ring_ref_name, sizeof(ring_ref_name),
+ "ring-ref%u", i);
+ error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
+ ring_ref_name, "%u", sc->ring_ref[i]);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/%s",
+ xenbus_get_node(sc->xb_dev),
+ ring_ref_name);
+ return (error);
+ }
+ }
}
error = bind_listening_port_to_irqhandler(
@@ -795,7 +882,7 @@ blkfront_connect(struct xb_softc *sc)
unsigned int binfo;
int err, feature_barrier;
- if( (sc->connected == BLKIF_STATE_CONNECTED) ||
+ if( (sc->connected == BLKIF_STATE_CONNECTED) ||
(sc->connected == BLKIF_STATE_SUSPENDED) )
return;
@@ -923,15 +1010,13 @@ blkif_close(struct disk *dp)
return (ENXIO);
sc->xb_flags &= ~XB_OPEN;
if (--(sc->users) == 0) {
- /* Check whether we have been instructed to close. We will
- have ignored this request initially, as the device was
- still mounted. */
- device_t dev = sc->xb_dev;
- XenbusState state =
- xenbus_read_driver_state(xenbus_get_otherend_path(dev));
-
- if (state == XenbusStateClosing)
- blkfront_closing(dev);
+ /*
+ * Check whether we have been instructed to close. We will
+ * have ignored this request initially, as the device was
+ * still mounted.
+ */
+ if (xenbus_get_otherend_state(sc->xb_dev) == XenbusStateClosing)
+ blkfront_closing(sc->xb_dev);
}
return (0);
}
@@ -1033,7 +1118,7 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
struct xb_command *cm;
blkif_request_t *ring_req;
struct blkif_request_segment *sg;
- struct blkif_request_segment *last_block_sg;
+ struct blkif_request_segment *last_block_sg;
grant_ref_t *sg_ref;
vm_paddr_t buffer_ma;
uint64_t fsect, lsect;
@@ -1104,12 +1189,12 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
nsegs--;
}
block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
- if (block_segs == 0)
- break;
+ if (block_segs == 0)
+ break;
- sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
+ sg = BLKRING_GET_SEG_BLOCK(&sc->ring, sc->ring.req_prod_pvt);
sc->ring.req_prod_pvt++;
- last_block_sg = sg + block_segs;
+ last_block_sg = sg + block_segs;
}
if (cm->operation == BLKIF_OP_READ)
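
The chained-request loop in blkif_queue_cb() above relies on the ring
accounting captured by BLKIF_SEGS_TO_BLOCKS() (defined in the blkif.h
hunks below).  A worked example, assuming for illustration only that
BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK is 11, matching the value the
patch gives BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK:

    /*
     * BLKIF_SEGS_TO_BLOCKS(33) = (((33 - 11) + (11 - 1)) / 11) + 1
     *                          = (32 / 11) + 1
     *                          = 3
     *
     * A 33-segment request therefore occupies one request header block
     * plus two segment blocks -- three contiguous ring entries -- and
     * both the producer and consumer indexes must advance by three.
     */
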
diff --git a/sys/dev/xen/blkfront/block.h b/sys/dev/xen/blkfront/block.h
index 6eabcf4e0723..5aa35ae54270 100644
--- a/sys/dev/xen/blkfront/block.h
+++ b/sys/dev/xen/blkfront/block.h
@@ -35,6 +35,32 @@
#include <xen/blkif.h>
/**
+ * Given a number of blkif segments, compute the maximum I/O size supported.
+ *
+ * \note This calculation assumes that all but the first and last segments
+ * of the I/O are fully utilized.
+ *
+ * \note We reserve a segement from the maximum supported by the transport to
+ * guarantee we can handle an unaligned transfer without the need to
+ * use a bounce buffer.
+ */
+#define XBF_SEGS_TO_SIZE(segs) \
+ (((segs) - 1) * PAGE_SIZE)
+
+/**
+ * Compute the maximum number of blkif segments requried to represent
+ * an I/O of the given size.
+ *
+ * \note This calculation assumes that all but the first and last segments
+ * of the I/O are fully utilized.
+ *
+ * \note We reserve a segement to guarantee we can handle an unaligned
+ * transfer without the need to use a bounce buffer.
+ */
+#define XBF_SIZE_TO_SEGS(size) \
+ ((size / PAGE_SIZE) + 1)
+
+/**
* The maximum number of outstanding requests blocks (request headers plus
* additional segment blocks) we will allow in a negotiated block-front/back
* communication channel.
@@ -44,22 +70,18 @@
/**
* The maximum mapped region size per request we will allow in a negotiated
* block-front/back communication channel.
- *
- * \note We reserve a segement from the maximum supported by the transport to
- * guarantee we can handle an unaligned transfer without the need to
- * use a bounce buffer..
*/
-#define XBF_MAX_REQUEST_SIZE \
- MIN(MAXPHYS, (BLKIF_MAX_SEGMENTS_PER_REQUEST - 1) * PAGE_SIZE)
+#define XBF_MAX_REQUEST_SIZE \
+ MIN(MAXPHYS, XBF_SEGS_TO_SIZE(BLKIF_MAX_SEGMENTS_PER_REQUEST))
/**
* The maximum number of segments (within a request header and accompanying
* segment blocks) per request we will allow in a negotiated block-front/back
* communication channel.
*/
-#define XBF_MAX_SEGMENTS_PER_REQUEST \
- (MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
- (XBF_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))
+#define XBF_MAX_SEGMENTS_PER_REQUEST \
+ (MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
+ XBF_SIZE_TO_SEGS(XBF_MAX_REQUEST_SIZE)))
/**
* The maximum number of shared memory ring pages we will allow in a
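
The two new macros invert each other once the reserved segment is taken
into account.  A short worked example, assuming PAGE_SIZE = 4096:

    /*
     * XBF_SEGS_TO_SIZE(33)     = (33 - 1) * 4096   = 131072 (128 KiB)
     * XBF_SIZE_TO_SEGS(131072) = 131072 / 4096 + 1 = 33
     *
     * A 128 KiB transfer that starts mid-page touches 33 pages, so one
     * segment beyond size/PAGE_SIZE is reserved to avoid bouncing.
     */
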
diff --git a/sys/xen/interface/io/blkif.h b/sys/xen/interface/io/blkif.h
index 020936b1f539..b6c930b26808 100644
--- a/sys/xen/interface/io/blkif.h
+++ b/sys/xen/interface/io/blkif.h
@@ -1,8 +1,8 @@
/******************************************************************************
* blkif.h
- *
+ *
* Unified block-device I/O interface for Xen guest OSes.
- *
+ *
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
@@ -22,6 +22,7 @@
* DEALINGS IN THE SOFTWARE.
*
* Copyright (c) 2003-2004, Keir Fraser
+ * Copyright (c) 2012, Spectra Logic Corporation
*/
#ifndef __XEN_PUBLIC_IO_BLKIF_H__
@@ -35,7 +36,7 @@
* notification can be made conditional on req_event (i.e., the generic
* hold-off mechanism provided by the ring macros). Backends must set
* req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
- *
+ *
* Back->front notifications: When enqueuing a new response, sending a
* notification can be made conditional on rsp_event (i.e., the generic
* hold-off mechanism provided by the ring macros). Frontends must set
@@ -48,37 +49,417 @@
#define blkif_sector_t uint64_t
/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen block driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters. This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * All data in the XenStore is stored as strings. Nodes specifying numeric
+ * values are encoded in decimal. Integer value ranges listed below are
+ * expressed as fixed sized integer types capable of storing the conversion
+ * of a properly formatted node string, without loss of information.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ * XenStore nodes marked "DEPRECATED" in their notes section should only be
+ * used to provide interoperability with legacy implementations.
+ *
+ * See the XenBus state transition diagram below for details on when XenBus
+ * nodes must be published and when they can be queried.
+ *
+ *****************************************************************************
+ * Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * mode
+ * Values: "r" (read only), "w" (writable)
+ *
+ * The read or write access permissions to the backing store to be
+ * granted to the frontend.
+ *
+ * params
+ * Values: string
+ *
+ * Data used by the backend driver to locate and configure the backing
+ * device. The format and semantics of this data vary according to the
+ * backing device in use and are outside the scope of this specification.
+ *
+ * type
+ * Values: "file", "phy", "tap"
+ *
+ * The type of the backing device/object.
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-barrier
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process requests
+ * containing the BLKIF_OP_WRITE_BARRIER request opcode. Requests
+ * of this type may still be returned at any time with the
+ * BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-flush-cache
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process requests
+ * containing the BLKIF_OP_FLUSH_DISKCACHE request opcode. Requests
+ * of this type may still be returned at any time with the
+ * BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-discard
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process requests
+ * containing the BLKIF_OP_DISCARD request opcode. Requests
+ * of this type may still be returned at any time with the
+ * BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ *----------------------- Request Transport Parameters ------------------------
+ *
+ * max-ring-page-order
+ * Values: <uint32_t>
+ * Default Value: 0
+ * Notes: 1, 3
+ *
+ * The maximum supported size of the request ring buffer in units of
+ * lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 == 4 pages,
+ * etc.).
+ *
+ * max-ring-pages
+ * Values: <uint32_t>
+ * Default Value: 1
+ * Notes: DEPRECATED, 2, 3
+ *
+ * The maximum supported size of the request ring buffer in units of
+ * machine pages. The value must be a power of 2.
+ *
+ * max-requests <uint32_t>
+ * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ * The maximum number of concurrent, logical requests supported by
+ * the backend.
+ *
+ * Note: A logical request may span multiple ring entries.
+ *
+ * max-request-segments
+ * Values: <uint8_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ * Maximum Value: BLKIF_MAX_SEGMENTS_PER_REQUEST
+ *
+ * The maximum value of blkif_request.nr_segments supported by
+ * the backend.
+ *
+ * max-request-size
+ * Values: <uint32_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ * Maximum Value: BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE
+ *
+ * The maximum amount of data, in bytes, that can be referenced by a
+ * request type that accesses frontend memory (currently BLKIF_OP_READ,
+ * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *------------------------- Backend Device Properties -------------------------
+ *
+ * discard-alignment
+ * Values: <uint32_t>
+ * Default Value: 0
+ * Notes: 4, 5
+ *
+ * The offset, in bytes from the beginning of the virtual block device,
+ * to the first, addressable, discard extent on the underlying device.
+ *
+ * discard-granularity
+ * Values: <uint32_t>
+ * Default Value: <"sector-size">
+ * Notes: 4
+ *
+ * The size, in bytes, of the individually addressable discard extents
+ * of the underlying device.
+ *
+ * discard-secure
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
+ * requests with the BLKIF_DISCARD_SECURE flag set.
+ *
+ * info
+ * Values: <uint32_t> (bitmap)
+ *
+ * A collection of bit flags describing attributes of the backing
+ * device. The VDISK_* macros define the meaning of each bit
+ * location.
+ *
+ * sector-size
+ * Values: <uint32_t>
+ *
+ * The size, in bytes, of the individually addressible data blocks
+ * on the backend device.
+ *
+ * sectors
+ * Values: <uint64_t>
+ *
+ * The size of the backend device, expressed in units of its native
+ * sector size ("sector-size").
+ *
+ *****************************************************************************
+ * Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ * Values: <uint32_t>
+ *
+ * The identifier of the Xen event channel used to signal activity
+ * in the ring buffer.
+ *
+ * ring-ref
+ * Values: <uint32_t>
+ * Notes: 6
+ *
+ * The Xen grant reference granting permission for the backend to map
+ * the sole page in a single page sized ring buffer.
+ *
+ * ring-ref%u
+ * Values: <uint32_t>
+ * Notes: 6
+ *
+ * For a frontend providing a multi-page ring, a "number of ring pages"
+ * sized list of nodes, each containing a Xen grant reference granting
+ * permission for the backend to map the page of the ring located
+ * at page index "%u". Page indexes are zero based.
+ *
+ * protocol
+ * Values: string (XEN_IO_PROTO_ABI_*)
+ * Default Value: XEN_IO_PROTO_ABI_NATIVE
+ *
+ * The machine ABI rules governing the format of all ring request and
+ * response structures.
+ *
+ * ring-page-order
+ * Values: <uint32_t>
+ * Default Value: 0
+ * Maximum Value: MAX(ffs(max-ring-pages) - 1, max-ring-page-order)
+ * Notes: 1, 3
+ *
+ * The size of the frontend allocated request ring buffer in units
+ * of lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 == 4 pages,
+ * etc.).
+ *
+ * num-ring-pages
+ * Values: <uint32_t>
+ * Default Value: 1
+ * Maximum Value: MAX(max-ring-pages,(0x1 << max-ring-page-order))
+ * Notes: DEPRECATED, 2, 3
+ *
+ * The size of the frontend allocated request ring buffer in units of
+ * machine pages. The value must be a power of 2.
+ *
+ * max-requests
+ * Values: <uint32_t>
+ * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ * The maximum number of concurrent, logical requests that will be
+ * issued by the frontend.
+ *
+ * Note: A logical request may span multiple ring entries.
+ *
+ * max-request-segments
+ * Values: <uint8_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ * Maximum Value: MIN(255, backend/max-request-segments)
+ *
+ * The maximum value the frontend will set in the
+ * blkif_request.nr_segments field.
+ *
+ * max-request-size
+ * Values: <uint32_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ * Maximum Value: max-request-segments * PAGE_SIZE
+ *
+ * The maximum amount of data, in bytes, that can be referenced by
+ * a request type that accesses frontend memory (currently BLKIF_OP_READ,
+ * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *------------------------- Virtual Device Properties -------------------------
+ *
+ * device-type
+ * Values: "disk", "cdrom", "floppy", etc.
+ *
+ * virtual-device
+ * Values: <uint32_t>
+ *
+ * A value indicating the physical device to virtualize within the
+ * frontend's domain. (e.g. "The first ATA disk", "The third SCSI
+ * disk", etc.)
+ *
+ * See docs/misc/vbd-interface.txt for details on the format of this
+ * value.
+ *
+ * Notes
+ * -----
+ * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
+ * PV drivers.
+ * (2) Multi-page ring buffer scheme first used in some Red Hat distributions
+ * including a distribution deployed on certain nodes of the Amazon
+ * EC2 cluster.
+ * (3) Support for multi-page ring buffers was implemented independently,
+ * in slightly different forms, by both Citrix and Red Hat/Amazon.
+ * For full interoperability, block front and backends should publish
+ * identical ring parameters, adjusted for unit differences, to the
+ * XenStore nodes used in both schemes.
+ * (4) Devices that support discard functionality may internally allocate
+ * space (discardable extents) in units that are larger than the
+ * exported logical block size.
+ * (5) The discard-alignment parameter allows a physical device to be
+ * partitioned into virtual devices that do not necessarily begin or
+ * end on a discardable extent boundary.
+ * (6) When there is only a single page allocated to the request ring,
+ * 'ring-ref' is used to communicate the grant reference for this
+ * page to the backend. When using a multi-page ring, the 'ring-ref'
+ * node is not created. Instead 'ring-ref0' - 'ring-refN' are used.
+ */
+
+/*
+ * STATE DIAGRAMS
+ *
+ *****************************************************************************
+ * Startup *
+ *****************************************************************************
+ *
+ * Tool stack creates front and back nodes with state XenbusStateInitialising.
+ *
+ * Front Back
+ * ================================= =====================================
+ * XenbusStateInitialising XenbusStateInitialising
+ * o Query virtual device o Query backend device identification
+ * properties. data.
+ * o Setup OS device instance. o Open and validate backend device.
+ * o Publish backend features and
+ * transport parameters.
+ * |
+ * |
+ * V
+ * XenbusStateInitWait
+ *
+ * o Query backend features and
+ * transport parameters.
+ * o Allocate and initialize the
+ * request ring.
+ * o Publish transport parameters
+ * that will be in effect during
+ * this connection.
+ * |
+ * |
+ * V
+ * XenbusStateInitialised
+ *
+ * o Query frontend transport parameters.
+ * o Connect to the request ring and
+ * event channel.
+ * o Publish backend device properties.
+ * |
+ * |
+ * V
+ * XenbusStateConnected
+ *
+ * o Query backend device properties.
+ * o Finalize OS virtual device
+ * instance.
+ * |
+ * |
+ * V
+ * XenbusStateConnected
+ *
+ * Note: Drivers that do not support any optional features, or the negotiation
+ * of transport parameters, can skip certain states in the state machine:
+ *
+ * o A frontend may transition to XenbusStateInitialised without
+ * waiting for the backend to enter XenbusStateInitWait. In this
+ * case, default transport parameters are in effect and any
+ * transport parameters published by the frontend must contain
+ * their default values.
+ *
+ * o A backend may transition to XenbusStateInitialised, bypassing
+ * XenbusStateInitWait, without waiting for the frontend to first
+ * enter the XenbusStateInitialised state. In this case, default
+ * transport parameters are in effect and any transport parameters
+ * published by the backend must contain their default values.
+ *
+ * Drivers that support optional features and/or transport parameter
+ * negotiation must tolerate these additional state transition paths.
+ * In general this means performing the work of any skipped state
+ * transition, if it has not already been performed, in addition to the
+ * work associated with entry into the current state.
+ */
+
+/*
* REQUEST CODES.
*/
#define BLKIF_OP_READ 0
#define BLKIF_OP_WRITE 1
/*
- * Recognised only if "feature-barrier" is present in backend xenbus info.
- * The "feature-barrier" node contains a boolean indicating whether barrier
- * requests are likely to succeed or fail. Either way, a barrier request
- * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
- * the underlying block-device hardware. The boolean simply indicates whether
- * or not it is worthwhile for the frontend to attempt barrier requests.
- * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
- * create the "feature-barrier" node!
+ * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER
+ * operation code ("barrier request") must be completed prior to the
+ * execution of the barrier request. All writes issued after the barrier
+ * request must not execute until after the completion of the barrier request.
+ *
+ * Optional. See "feature-barrier" XenBus node documentation above.
*/
#define BLKIF_OP_WRITE_BARRIER 2
/*
- * Recognised if "feature-flush-cache" is present in backend xenbus
- * info. A flush will ask the underlying storage hardware to flush its
- * non-volatile caches as appropriate. The "feature-flush-cache" node
- * contains a boolean indicating whether flush requests are likely to
- * succeed or fail. Either way, a flush request may fail at any time
- * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
- * block-device hardware. The boolean simply indicates whether or not it
- * is worthwhile for the frontend to attempt flushes. If a backend does
- * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the
- * "feature-flush-cache" node!
+ * Commit any uncommitted contents of the backing device's volatile cache
+ * to stable storage.
+ *
+ * Optional. See "feature-flush-cache" XenBus node documentation above.
*/
#define BLKIF_OP_FLUSH_DISKCACHE 3
+/*
+ * Used in SLES sources for device specific command packet
+ * contained within the request. Reserved for that purpose.
+ */
+#define BLKIF_OP_RESERVED_1 4
+/*
+ * Indicate to the backend device that a region of storage is no longer in
+ * use, and may be discarded at any time without impact to the client. If
+ * the BLKIF_DISCARD_SECURE flag is set on the request, all copies of the
+ * discarded region on the device must be rendered unrecoverable before the
+ * command returns.
+ *
+ * This operation is analogous to performing a trim (ATA) or unmap (SCSI),
+ * command on a native device.
+ *
+ * More information about trim/unmap operations can be found at:
+ * http://t13.org/Documents/UploadedDocuments/docs2008/
+ * e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc
+ * http://www.seagate.com/staticfiles/support/disc/manuals/
+ * Interface%20manuals/100293068c.pdf
+ *
+ * Optional. See "feature-discard", "discard-alignment",
+ * "discard-granularity", and "discard-secure" in the XenBus node
+ * documentation above.
+ */
+#define BLKIF_OP_DISCARD 5
/*
* Maximum scatter/gather segments associated with a request header block.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
*/
#define BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK 11
@@ -92,6 +473,13 @@
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 255
+/*
+ * NB. first_sect and last_sect in blkif_request_segment, as well as
+ * sector_number in blkif_request, are always expressed in 512-byte units.
+ * However they must be properly aligned to the real sector size of the
+ * physical disk, which is reported in the "sector-size" node in the backend
+ * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
+ */
struct blkif_request_segment {
grant_ref_t gref; /* reference to I/O buffer frame */
/* @first_sect: first sector in frame to transfer (inclusive). */
@@ -100,16 +488,60 @@ struct blkif_request_segment {
};
typedef struct blkif_request_segment blkif_request_segment_t;
+/*
+ * Starting ring element for any I/O request.
+ *
+ * One or more segment blocks can be inserted into the request ring
+ * just after a blkif_request_t, allowing requests to operate on
+ * up to BLKIF_MAX_SEGMENTS_PER_REQUEST.
+ *
+ * BLKIF_SEGS_TO_BLOCKS() can be used on blkif_requst.nr_segments
+ * to determine the number of contiguous ring entries associated
+ * with this request.
+ *
+ * Note: Due to the way Xen request rings operate, the producer and
+ * consumer indices of the ring must be incremented by the
+ * BLKIF_SEGS_TO_BLOCKS() value of the associated request.
+ * (e.g. a response to a 3 ring entry request must also consume
+ * 3 entries in the ring, even though only the first ring entry
+ * in the response has any data.)
+ */
struct blkif_request {
uint8_t operation; /* BLKIF_OP_??? */
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
- struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+ blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
};
typedef struct blkif_request blkif_request_t;
+/*
+ * A segment block is a ring request structure that contains only
+ * segment data.
+ *
+ * sizeof(struct blkif_segment_block) <= sizeof(struct blkif_request)
+ */
+struct blkif_segment_block {
+ blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK];
+};
+typedef struct blkif_segment_block blkif_segment_block_t;
+
+/*
+ * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD
+ * sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request)
+ */
+struct blkif_request_discard {
+ uint8_t operation; /* BLKIF_OP_DISCARD */
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+#define BLKIF_DISCARD_SECURE (1<<0) /* ignored if discard-secure=0 */
+ blkif_vdev_t handle; /* same as for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk */
+ uint64_t nr_sectors; /* number of contiguous sectors to discard*/
+};
+typedef struct blkif_request_discard blkif_request_discard_t;
+
struct blkif_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
@@ -130,24 +562,26 @@ typedef struct blkif_response blkif_response_t;
/*
* Generate blkif ring structures and types.
*/
-
DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
-#define BLKRING_GET_SG_REQUEST(_r, _idx) \
- ((struct blkif_request_segment *)RING_GET_REQUEST(_r, _idx))
-
-#define VDISK_CDROM 0x1
-#define VDISK_REMOVABLE 0x2
-#define VDISK_READONLY 0x4
+/*
+ * Index to, and treat as a segment block, an entry in the ring.
+ */
+#define BLKRING_GET_SEG_BLOCK(_r, _idx) \
+ (((blkif_segment_block_t *)RING_GET_REQUEST(_r, _idx))->seg)
/*
* The number of ring request blocks required to handle an I/O
* request containing _segs segments.
*/
-#define BLKIF_SEGS_TO_BLOCKS(_segs) \
- ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK) \
- + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1)) \
- / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
+#define BLKIF_SEGS_TO_BLOCKS(_segs) \
+ ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK) \
+ + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1)) \
+ / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
+
+#define VDISK_CDROM 0x1
+#define VDISK_REMOVABLE 0x2
+#define VDISK_READONLY 0x4
#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
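
The new blkif_request_discard structure is used by overlaying a ring
slot, much as the dispatch code above treats ordinary requests.  A
hypothetical frontend sketch follows; none of this code is part of the
patch, and "ring", "handle", and "id" are assumed to come from the
surrounding driver state.  blkif_front_ring_t and RING_GET_REQUEST()
come from the DEFINE_RING_TYPES() expansion shown above.

    static void
    fill_discard_request(blkif_front_ring_t *ring, blkif_vdev_t handle,
        uint64_t id, blkif_sector_t start, uint64_t nsectors, int secure)
    {
            blkif_request_discard_t *req;

            req = (blkif_request_discard_t *)
                RING_GET_REQUEST(ring, ring->req_prod_pvt);
            ring->req_prod_pvt++;

            req->operation = BLKIF_OP_DISCARD;
            req->flag = secure ? BLKIF_DISCARD_SECURE : 0;
            req->handle = handle;
            req->id = id;
            req->sector_number = start;     /* always 512-byte units */
            req->nr_sectors = nsectors;     /* always 512-byte units */
    }
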
diff --git a/sys/xen/xenbus/xenbusvar.h b/sys/xen/xenbus/xenbusvar.h
index bf2a342b546f..1c730fb900a5 100644
--- a/sys/xen/xenbus/xenbusvar.h
+++ b/sys/xen/xenbus/xenbusvar.h
@@ -104,6 +104,20 @@ XENBUS_ACCESSOR(otherend_path, OTHEREND_PATH, const char *)
XenbusState xenbus_read_driver_state(const char *path);
/**
+ * Return the state of the "other end" (peer) of a XenBus device.
+ *
+ * \param dev The XenBus device whose peer to query.
+ *
+ * \return The current state of the peer device or XenbusStateClosed if no
+ * state can be read.
+ */
+static inline XenbusState
+xenbus_get_otherend_state(device_t dev)
+{
+ return (xenbus_read_driver_state(xenbus_get_otherend_path(dev)));
+}
+
+/**
* Initialize and register a watch on the given path (client suplied storage).
*
* \param dev The XenBus device requesting the watch service.