authorAndriy Gapon <avg@FreeBSD.org>2019-11-21 14:00:09 +0000
committerAndriy Gapon <avg@FreeBSD.org>2019-11-21 14:00:09 +0000
commite86bc27866b0b05c05a6c6baa2751ced4e4f4a92 (patch)
tree062d39a1e0c91ec5f3a7e0f422ef056b512e4bac
parenta15de8f95aabc1f35a16ab8db62c7ebe62f0c8fe (diff)
10566 Multiple DVA Scrubbing Fix
illumos/illumos-gate@12a8814c13fbb1d6d58616cf090ea5815dc107f9
https://github.com/illumos/illumos-gate/commit/12a8814c13fbb1d6d58616cf090ea5815dc107f9
https://www.illumos.org/issues/10566
ZoL PR_8453

Author: Tom Caputi <tcaputi@datto.com>
Date:   Fri Mar 15 17:14:31 2019 -0400

    Multiple DVA Scrubbing Fix

    Currently, there is an issue in the sequential scrub code which prevents
    self healing from working in some cases. The scrub code will split up all
    DVA copies of a bp and issue each of them separately. The problem is that,
    since each of the DVAs is no longer associated with the others, the self
    healing code doesn't have the opportunity to repair problems that show up
    in one of the DVAs with the data from the others.

    This patch fixes this issue by ensuring that all IOs issued by the
    sequential scrub code include all DVAs. Initially, only the first DVA of
    each is attempted. If an issue arises, the IO is retried with all
    available copies, giving the self healing code a chance to correct the
    issue.

    To test this change, this patch also adds the ability for zinject to
    specify individual DVAs to inject read errors into. We then add a new
    test case that utilizes this functionality to ensure scrubs and
    self-healing reads can handle and transparently fix issues with
    individual copies of blocks.

This update is a followup to #10405. While attempting to port this update,
the following ZoL updates are included:

551905dd4 vdev_mirror: kstat observables for preferred vdev
d6c6590c5 vdev_mirror: load balancing fixes
9f500936c FreeBSD r256956: Improve ZFS N-way mirror read performance by

Portions contributed by: Toomas Soome <tsoome@me.com>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Author: Tom Caputi <tcaputi@datto.com>
Notes:
    svn path=/vendor-sys/illumos/dist/; revision=354954
-rw-r--r--  cmd/zinject/zinject.c              176
-rw-r--r--  uts/common/fs/zfs/dsl_scan.c       199
-rw-r--r--  uts/common/fs/zfs/metaslab.c         2
-rw-r--r--  uts/common/fs/zfs/spa_misc.c         2
-rw-r--r--  uts/common/fs/zfs/sys/spa.h          4
-rw-r--r--  uts/common/fs/zfs/sys/vdev.h         3
-rw-r--r--  uts/common/fs/zfs/sys/vdev_impl.h    1
-rw-r--r--  uts/common/fs/zfs/sys/zfs_ioctl.h    4
-rw-r--r--  uts/common/fs/zfs/vdev.c            22
-rw-r--r--  uts/common/fs/zfs/vdev_disk.c       10
-rw-r--r--  uts/common/fs/zfs/vdev_file.c        3
-rw-r--r--  uts/common/fs/zfs/vdev_mirror.c    365
-rw-r--r--  uts/common/fs/zfs/vdev_queue.c      33
-rw-r--r--  uts/common/fs/zfs/zio_inject.c      42
14 files changed, 682 insertions, 184 deletions
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index efae04675e38..1c0b3199bd98 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -47,48 +47,48 @@
*
* This form of the command looks like:
*
- * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
+ * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
*
*
* DATA FAULTS
*
* We begin with a tuple of the form:
*
- * <type,level,range,object>
+ * <type,level,range,object>
*
- * type A string describing the type of data to target. Each type
- * implicitly describes how to interpret 'object'. Currently,
- * the following values are supported:
+ * type A string describing the type of data to target. Each type
+ * implicitly describes how to interpret 'object'. Currently,
+ * the following values are supported:
*
- * data User data for a file
- * dnode Dnode for a file or directory
+ * data User data for a file
+ * dnode Dnode for a file or directory
*
* The following MOS objects are special. Instead of injecting
* errors on a particular object or blkid, we inject errors across
* all objects of the given type.
*
- * mos Any data in the MOS
- * mosdir object directory
- * config pool configuration
- * bpobj blkptr list
- * spacemap spacemap
- * metaslab metaslab
- * errlog persistent error log
+ * mos Any data in the MOS
+ * mosdir object directory
+ * config pool configuration
+ * bpobj blkptr list
+ * spacemap spacemap
+ * metaslab metaslab
+ * errlog persistent error log
*
- * level Object level. Defaults to '0', not applicable to all types. If
- * a range is given, this corresponds to the indirect block
- * corresponding to the specific range.
+ * level Object level. Defaults to '0', not applicable to all types. If
+ * a range is given, this corresponds to the indirect block
+ * corresponding to the specific range.
*
* range A numerical range [start,end) within the object. Defaults to
* the full size of the file.
*
- * object A string describing the logical location of the object. For
- * files and directories (currently the only supported types),
- * this is the path of the object on disk.
+ * object A string describing the logical location of the object. For
+ * files and directories (currently the only supported types),
+ * this is the path of the object on disk.
*
* This is translated, via libzpool, into the following internal representation:
*
- * <type,objset,object,level,range>
+ * <type,objset,object,level,range>
*
* These types should be self-explanatory. This tuple is then passed to the
* kernel via a special ioctl() to initiate fault injection for the given
@@ -98,12 +98,12 @@
*
* The command itself takes one of the forms:
*
- * zinject
- * zinject <-a | -u pool>
- * zinject -c <id|all>
- * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
+ * zinject
+ * zinject <-a | -u pool>
+ * zinject -c <id|all>
+ * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
* [-r range] <object>
- * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
+ * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
*
* With no arguments, the command prints all currently registered injection
* handlers, with their numeric identifiers.
@@ -288,8 +288,8 @@ usage(void)
"\t\tspecified by the remaining tuple. Each number is in\n"
"\t\thexidecimal, and only one block can be specified.\n"
"\n"
- "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n"
- "\t [-a] [-m] [-u] [-f freq] <object>\n"
+ "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
+ "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
"\n"
"\t\tInject an error into the object specified by the '-t' option\n"
"\t\tand the object descriptor. The 'object' parameter is\n"
@@ -297,7 +297,10 @@ usage(void)
"\n"
"\t\t-q\tQuiet mode. Only print out the handler number added.\n"
"\t\t-e\tInject a specific error. Must be either 'io' or\n"
- "\t\t\t'checksum'. Default is 'io'.\n"
+ "\t\t\t'checksum', or 'decompress'. Default is 'io'.\n"
+ "\t\t-C\tInject the given error only into specific DVAs. The\n"
+ "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
+ "\t\t\tseparated by commas (ex. '0,2').\n"
"\t\t-l\tInject error at a particular block level. Default is "
"0.\n"
"\t\t-m\tAutomatically remount underlying filesystem.\n"
@@ -358,17 +361,19 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
return (0);
if (*count == 0) {
- (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n",
- "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE");
+ (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s ",
+ "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
+ "LVL", "DVAs", "RANGE");
(void) printf("--- --------------- ------ "
- "------ -------- --- ---------------\n");
+ "------ -------- --- ---- ----------------\n");
}
*count += 1;
- (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool,
- (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object,
- type_to_name(record->zi_type), record->zi_level);
+ (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ",
+ id, pool, (u_longlong_t)record->zi_objset,
+ (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
+ record->zi_level, record->zi_dvas);
if (record->zi_start == 0 &&
record->zi_end == -1ULL)
@@ -598,6 +603,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
(void) printf(" range: [%llu, %llu)\n",
(u_longlong_t)record->zi_start,
(u_longlong_t)record->zi_end);
+ (void) printf(" dvas: 0x%x\n", record->zi_dvas);
}
}
@@ -649,6 +655,59 @@ parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
return (0);
}
+/*
+ * This function converts a string specifier for DVAs into a bit mask.
+ * The DVAs provided by the user should be 0-indexed and separated
+ * by commas. For example:
+ * "1" -> 0b0010 (0x2)
+ * "0,1" -> 0b0011 (0x3)
+ * "0,1,2" -> 0b0111 (0x7)
+ */
+static int
+parse_dvas(const char *str, uint32_t *dvas_out)
+{
+ const char *c = str;
+ uint32_t mask = 0;
+ boolean_t need_delim = B_FALSE;
+
+ /* max string length is 5 ("0,1,2") */
+ if (strlen(str) > 5 || strlen(str) == 0)
+ return (EINVAL);
+
+ while (*c != '\0') {
+ switch (*c) {
+ case '0':
+ case '1':
+ case '2':
+ /* a comma must separate consecutive DVAs */
+ if (need_delim)
+ return (EINVAL);
+
+ /* check if this DVA has been set already */
+ if (mask & (1 << ((*c) - '0')))
+ return (EINVAL);
+
+ mask |= (1 << ((*c) - '0'));
+ need_delim = B_TRUE;
+ break;
+ case ',':
+ need_delim = B_FALSE;
+ break;
+ default:
+ /* check for invalid character */
+ return (EINVAL);
+ }
+ c++;
+ }
+
+ /* check for dangling delimiter */
+ if (!need_delim)
+ return (EINVAL);
+
+ *dvas_out = mask;
+ return (0);
+}
+
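As a quick illustration outside the patch, the mask encoding that parse_dvas() implements can be exercised in userland. The sketch below is a simplified, self-contained restatement of the same rules (dvas_to_mask and the test strings are hypothetical names for demonstration), not the patch code itself:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* simplified restatement of the parse_dvas() rules above */
static int
dvas_to_mask(const char *str, uint32_t *mask_out)
{
        uint32_t mask = 0;
        int expect_digit = 1;

        for (const char *c = str; *c != '\0'; c++) {
                if (expect_digit && *c >= '0' && *c <= '2') {
                        uint32_t bit = 1u << (*c - '0');
                        if (mask & bit)         /* duplicate DVA */
                                return (EINVAL);
                        mask |= bit;
                        expect_digit = 0;
                } else if (!expect_digit && *c == ',') {
                        expect_digit = 1;       /* a digit must follow */
                } else {
                        return (EINVAL);        /* bad char or misplaced comma */
                }
        }
        if (expect_digit)                       /* empty string or dangling comma */
                return (EINVAL);
        *mask_out = mask;
        return (0);
}

int
main(void)
{
        const char *specs[] = { "1", "0,1", "0,1,2", "0,2" };

        for (int i = 0; i < 4; i++) {
                uint32_t m;
                if (dvas_to_mask(specs[i], &m) == 0)
                        (void) printf("%-6s -> 0x%x\n", specs[i], m);
        }
        return (0);
}

Running it prints 0x2, 0x3, 0x7 and 0x5, matching the mask examples in the comment above.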
int
main(int argc, char **argv)
{
@@ -675,6 +734,7 @@ main(int argc, char **argv)
int dur_secs = 0;
int ret;
int flags = 0;
+ uint32_t dvas = 0;
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, "internal error: failed to "
@@ -705,7 +765,7 @@ main(int argc, char **argv)
}
while ((c = getopt(argc, argv,
- ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+ ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@@ -728,6 +788,17 @@ main(int argc, char **argv)
case 'c':
cancel = optarg;
break;
+ case 'C':
+ ret = parse_dvas(optarg, &dvas);
+ if (ret != 0) {
+ (void) fprintf(stderr, "invalid DVA list '%s': "
+ "DVAs should be 0 indexed and separated by "
+ "commas.\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
case 'd':
device = optarg;
break;
@@ -887,7 +958,8 @@ main(int argc, char **argv)
* '-c' is invalid with any other options.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@@ -919,7 +991,8 @@ main(int argc, char **argv)
* for doing injection, so handle it separately here.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ dvas != 0) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@@ -953,7 +1026,8 @@ main(int argc, char **argv)
} else if (raw != NULL) {
if (range != NULL || type != TYPE_INVAL || level != 0 ||
- record.zi_cmd != ZINJECT_UNINITIALIZED) {
+ record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "raw (-b) format with "
"any other options\n");
usage();
@@ -983,7 +1057,8 @@ main(int argc, char **argv)
error = EIO;
} else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || device != NULL) {
+ level != 0 || device != NULL || record.zi_freq > 0 ||
+ dvas != 0) {
(void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n");
usage();
@@ -1002,6 +1077,15 @@ main(int argc, char **argv)
record.zi_type = atoi(argv[1]);
dataset[0] = '\0';
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || record.zi_freq > 0 || dvas != 0) {
+ (void) fprintf(stderr, "hardware failure (-I) "
+ "incompatible with other options\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
if (nowrites == 0) {
(void) fprintf(stderr, "-s or -g meaningless "
"without -I (ignore writes)\n");
@@ -1055,6 +1139,18 @@ main(int argc, char **argv)
return (1);
}
+ if (dvas != 0) {
+ if (error == EACCES || error == EINVAL) {
+ (void) fprintf(stderr, "the '-C' option may "
+ "not be used with logical data errors "
+ "'decrypt' and 'decompress'\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ record.zi_dvas = dvas;
+ }
+
record.zi_cmd = ZINJECT_DATA_FAULT;
if (translate_record(type, argv[0], range, level, &record, pool,
dataset) != 0)
diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c
index 00bd1498a2fd..ca82195178d7 100644
--- a/uts/common/fs/zfs/dsl_scan.c
+++ b/uts/common/fs/zfs/dsl_scan.c
@@ -249,24 +249,43 @@ typedef enum {
*/
typedef struct scan_io {
/* fields from blkptr_t */
- uint64_t sio_offset;
uint64_t sio_blk_prop;
uint64_t sio_phys_birth;
uint64_t sio_birth;
zio_cksum_t sio_cksum;
- uint32_t sio_asize;
+ uint32_t sio_nr_dvas;
/* fields from zio_t */
- int sio_flags;
+ uint32_t sio_flags;
zbookmark_phys_t sio_zb;
/* members for queue sorting */
union {
- avl_node_t sio_addr_node; /* link into issueing queue */
+ avl_node_t sio_addr_node; /* link into issuing queue */
list_node_t sio_list_node; /* link for issuing to disk */
} sio_nodes;
+
+ /*
+ * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+ * depending on how many were in the original bp. Only the
+ * first DVA is really used for sorting and issuing purposes.
+ * The other DVAs (if provided) simply exist so that the zio
+ * layer can find additional copies to repair from in the
+ * event of an error. This array must go at the end of the
+ * struct so that the number of elements can vary.
+ */
+ dva_t sio_dva[0];
} scan_io_t;
+#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define SIO_GET_END_OFFSET(sio) \
+ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define SIO_GET_MUSED(sio) \
+ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
@@ -275,6 +294,7 @@ struct dsl_scan_io_queue {
range_tree_t *q_exts_by_addr;
avl_tree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
+ uint64_t q_sio_memused;
/* members for zio rate limiting */
uint64_t q_maxinflight_bytes;
@@ -313,7 +333,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);
-static kmem_cache_t *sio_cache;
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
+
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+ ASSERT3U(nr_dvas, >, 0);
+ ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
void
scan_init(void)
@@ -328,14 +368,22 @@ scan_init(void)
*/
fill_weight = zfs_scan_fill_weight;
- sio_cache = kmem_cache_create("sio_cache",
- sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ char name[36];
+
+ (void) sprintf(name, "sio_cache_%d", i);
+ sio_cache[i] = kmem_cache_create(name,
+ (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
}
void
scan_fini(void)
{
- kmem_cache_destroy(sio_cache);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ kmem_cache_destroy(sio_cache[i]);
+ }
}
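The reason for one kmem cache per DVA count is that scan_io_t is now variable-sized: the trailing dva_t array makes each allocation sizeof (scan_io_t) plus one dva_t per copy, which is exactly what SIO_GET_MUSED computes. A minimal userland sketch of that sizing, with the fixed fields abridged and names hypothetical:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct dva { uint64_t dva_word[2]; } dva_t;    /* 16 bytes, as in ZFS */

typedef struct scan_io {
        uint64_t sio_blk_prop;          /* other fixed fields elided */
        uint32_t sio_nr_dvas;
        dva_t sio_dva[];                /* C99 flexible array; the patch uses [0] */
} scan_io_t;

int
main(void)
{
        /* one size class per possible DVA count, as in sio_cache[] */
        for (int n = 1; n <= 3; n++)    /* SPA_DVAS_PER_BP == 3 */
                (void) printf("sio with %d DVA(s): %zu bytes\n",
                    n, sizeof (scan_io_t) + n * sizeof (dva_t));
        return (0);
}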
static inline boolean_t
@@ -352,29 +400,39 @@ dsl_scan_resilvering(dsl_pool_t *dp)
}
static inline void
-sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
bzero(bp, sizeof (*bp));
- DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
- DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
- DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
bp->blk_prop = sio->sio_blk_prop;
bp->blk_phys_birth = sio->sio_phys_birth;
bp->blk_birth = sio->sio_birth;
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
+
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
}
static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
- /* we discard the vdev id, since we can deduce it from the queue */
- sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
- sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
sio->sio_blk_prop = bp->blk_prop;
sio->sio_phys_birth = bp->blk_phys_birth;
sio->sio_birth = bp->blk_birth;
sio->sio_cksum = bp->blk_cksum;
+ sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+ /*
+ * Copy the DVAs to the sio. We need all copies of the block so
+ * that the self healing code can use the alternate copies if the
+ * first is corrupted. We want the DVA at index dva_i to be first
+ * in the sio since this is the primary one that we want to issue.
+ */
+ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+ sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+ }
}
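The modular rotation in bp2sio() can be pictured in isolation: for a 3-DVA bp queued under DVA index 1, the copies land in the sio in the order 1, 2, 0, so the copy actually being issued always sits in sio_dva[0]. A hedged standalone sketch of just the index arithmetic:

#include <stdio.h>

int
main(void)
{
        int ndvas = 3, dva_i = 1;       /* queue for the bp's second copy */

        /* same rotation as bp2sio(): issued copy ends up at index 0 */
        for (int i = 0, j = dva_i; i < ndvas; i++, j++)
                (void) printf("sio_dva[%d] <- blk_dva[%d]\n", i, j % ndvas);
        return (0);
}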
int
@@ -1076,11 +1134,9 @@ dsl_scan_should_clear(dsl_scan_t *scn)
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
- /* #extents in exts_by_size = # in exts_by_addr */
+ /* # extents in exts_by_size = # in exts_by_addr */
mused += avl_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_t) +
- avl_numnodes(&queue->q_sios_by_addr) *
- sizeof (scan_io_t);
+ sizeof (range_seg_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
@@ -2546,13 +2602,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
break;
}
- sio2bp(sio, &bp, queue->q_vd->vdev_id);
- bytes_issued += sio->sio_asize;
+ sio2bp(sio, &bp);
+ bytes_issued += SIO_GET_ASIZE(sio);
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
&sio->sio_zb, queue);
(void) list_remove_head(io_list);
scan_io_queues_update_zio_stats(queue, &bp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
@@ -2569,7 +2625,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
static boolean_t
scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
{
- scan_io_t srch_sio, *sio, *next_sio;
+ scan_io_t *srch_sio, *sio, *next_sio;
avl_index_t idx;
uint_t num_sios = 0;
int64_t bytes_issued = 0;
@@ -2577,24 +2633,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
ASSERT(rs != NULL);
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
- srch_sio.sio_offset = rs->rs_start;
+ srch_sio = sio_alloc(1);
+ srch_sio->sio_nr_dvas = 1;
+ SIO_SET_OFFSET(srch_sio, rs->rs_start);
/*
* The exact start of the extent might not contain any matching zios,
* so if that's the case, examine the next one in the tree.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio == NULL)
sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
- while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
- ASSERT3U(sio->sio_offset, >=, rs->rs_start);
- ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+ while (sio != NULL &&
+ SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
- bytes_issued += sio->sio_asize;
+ bytes_issued += SIO_GET_ASIZE(sio);
num_sios++;
list_insert_tail(list, sio);
sio = next_sio;
@@ -2606,11 +2668,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
* in the segment we update it to reflect the work we were able to
* complete. Otherwise, we remove it from the range tree entirely.
*/
- if (sio != NULL && sio->sio_offset < rs->rs_end) {
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
range_tree_adjust_fill(queue->q_exts_by_addr, rs,
-bytes_issued);
range_tree_resize_segment(queue->q_exts_by_addr, rs,
- sio->sio_offset, rs->rs_end - sio->sio_offset);
+ SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
return (B_TRUE);
} else {
@@ -2715,9 +2777,9 @@ scan_io_queues_run_one(void *arg)
first_sio = list_head(&sio_list);
last_sio = list_tail(&sio_list);
- seg_end = last_sio->sio_offset + last_sio->sio_asize;
+ seg_end = SIO_GET_END_OFFSET(last_sio);
if (seg_start == 0)
- seg_start = first_sio->sio_offset;
+ seg_start = SIO_GET_OFFSET(first_sio);
/*
* Issuing sios can take a long time so drop the
@@ -3369,10 +3431,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
- /* update the spa's stats on how many bytes we have issued */
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ /*
+ * Update the spa's stats on how many bytes we have issued.
+ * Sequential scrubs create a zio for each DVA of the bp. Each
+ * of these will include all DVAs for repair purposes, but the
+ * zio code will only try the first one unless there is an issue.
+ * Therefore, we should only count the first DVA for these IOs.
+ */
+ if (scn->scn_is_sorted) {
atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[i]));
+ DVA_GET_ASIZE(&bp->blk_dva[0]));
+ } else {
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
}
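A quick numeric check of the accounting above: for a bp with three copies, a sorted scrub (which issues one zio per DVA elsewhere) credits only the first DVA per zio, while a legacy scrub credits every copy for its single zio. A sketch with made-up sizes:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t asize[3] = { 4096, 4096, 4096 };       /* made-up DVA sizes */
        uint64_t sorted, legacy = 0;

        sorted = asize[0];      /* sorted scrub: other DVAs ride along for repair */
        for (int i = 0; i < 3; i++)
                legacy += asize[i];     /* legacy scrub: one zio covers every copy */

        (void) printf("pass_issued += %llu (sorted) vs %llu (legacy)\n",
            (unsigned long long)sorted, (unsigned long long)legacy);
        return (0);
}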
/*
@@ -3426,7 +3501,7 @@ static void
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
avl_index_t idx;
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
dsl_scan_t *scn = queue->q_scn;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3434,11 +3509,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
/* block is already scheduled for reading */
atomic_add_64(&scn->scn_bytes_pending, -asize);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
return;
}
avl_insert(&queue->q_sios_by_addr, sio, idx);
- range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+ queue->q_sio_memused += SIO_GET_MUSED(sio);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
}
/*
@@ -3452,7 +3528,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
int zio_flags, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);
+ scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
ASSERT0(BP_IS_GANG(bp));
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3466,7 +3542,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
* get an integer underflow in case the worker processes the
* zio before we get to incrementing this counter.
*/
- atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
scan_io_queue_insert_impl(queue, sio);
}
@@ -3699,15 +3775,11 @@ ext_size_compare(const void *x, const void *y)
* based on LBA-order (from lowest to highest).
*/
static int
-io_addr_compare(const void *x, const void *y)
+sio_addr_compare(const void *x, const void *y)
{
const scan_io_t *a = x, *b = y;
- if (a->sio_offset < b->sio_offset)
- return (-1);
- if (a->sio_offset == b->sio_offset)
- return (0);
- return (1);
+ return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
}
/* IO queues are created on demand when they are needed. */
@@ -3719,10 +3791,11 @@ scan_io_queue_create(vdev_t *vd)
q->q_scn = scn;
q->q_vd = vd;
+ q->q_sio_memused = 0;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
&q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
- avl_create(&q->q_sios_by_addr, io_addr_compare,
+ avl_create(&q->q_sios_by_addr, sio_addr_compare,
sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
return (q);
@@ -3746,11 +3819,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
NULL) {
ASSERT(range_tree_contains(queue->q_exts_by_addr,
- sio->sio_offset, sio->sio_asize));
- bytes_dequeued += sio->sio_asize;
- kmem_free(sio, sizeof (*sio));
+ SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+ bytes_dequeued += SIO_GET_ASIZE(sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ sio_free(sio);
}
+ ASSERT0(queue->q_sio_memused);
atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
range_tree_destroy(queue->q_exts_by_addr);
@@ -3805,7 +3880,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
vdev_t *vdev;
kmutex_t *q_lock;
dsl_scan_io_queue_t *queue;
- scan_io_t srch, *sio;
+ scan_io_t *srch_sio, *sio;
avl_index_t idx;
uint64_t start, size;
@@ -3820,9 +3895,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
return;
}
- bp2sio(bp, &srch, dva_i);
- start = srch.sio_offset;
- size = srch.sio_asize;
+ srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+ bp2sio(bp, srch_sio, dva_i);
+ start = SIO_GET_OFFSET(srch_sio);
+ size = SIO_GET_ASIZE(srch_sio);
/*
* We can find the zio in two states:
@@ -3842,15 +3918,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
* be done with issuing the zio's it gathered and will
* signal us.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio != NULL) {
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
blkptr_t tmpbp;
/* Got it while it was cold in the queue */
- ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
ASSERT3U(size, ==, asize);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
@@ -3863,10 +3942,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
atomic_add_64(&scn->scn_bytes_pending, -asize);
/* count the block as though we issued it */
- sio2bp(sio, &tmpbp, dva_i);
+ sio2bp(sio, &tmpbp);
count_block(scn, dp->dp_blkstats, &tmpbp);
- kmem_free(sio, sizeof (*sio));
+ sio_free(sio);
}
mutex_exit(q_lock);
}
diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c
index 1c004f87f31a..d0b9f6960f2a 100644
--- a/uts/common/fs/zfs/metaslab.c
+++ b/uts/common/fs/zfs/metaslab.c
@@ -2069,7 +2069,7 @@ metaslab_space_weight(metaslab_t *msp)
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
- if (metaslab_lba_weighting_enabled) {
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space);
}
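The LBA weighting this change now skips for non-rotational vdevs scales a metaslab's free space by its position, favoring low-numbered (outer, higher-bandwidth) metaslabs on spinning disks. A sketch of the formula with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t space = 1000, ms_count = 10;   /* made-up vdev geometry */

        for (uint64_t ms_id = 0; ms_id < ms_count; ms_id += 3) {
                /* weight stays within [space, 2 * space], as the ASSERT requires */
                uint64_t weight = 2 * space - (ms_id * space) / ms_count;
                (void) printf("metaslab %2llu: weight %llu\n",
                    (unsigned long long)ms_id, (unsigned long long)weight);
        }
        return (0);
}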
diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c
index 7a44ac86b003..0a5cec2644b2 100644
--- a/uts/common/fs/zfs/spa_misc.c
+++ b/uts/common/fs/zfs/spa_misc.c
@@ -2036,6 +2036,7 @@ spa_init(int mode)
dmu_init();
zil_init();
vdev_cache_stat_init();
+ vdev_mirror_stat_init();
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
@@ -2052,6 +2053,7 @@ spa_fini(void)
spa_evict_all();
vdev_cache_stat_fini();
+ vdev_mirror_stat_fini();
zil_fini();
dmu_fini();
zio_fini();
diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h
index 4ff552447e2c..53b9e4ef5d6b 100644
--- a/uts/common/fs/zfs/sys/spa.h
+++ b/uts/common/fs/zfs/sys/spa.h
@@ -907,6 +907,10 @@ extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
extern void vdev_cache_stat_init(void);
extern void vdev_cache_stat_fini(void);
+/* vdev mirror */
+extern void vdev_mirror_stat_init(void);
+extern void vdev_mirror_stat_fini(void);
+
/* Initialization and termination */
extern void spa_init(int flags);
extern void spa_fini(void);
diff --git a/uts/common/fs/zfs/sys/vdev.h b/uts/common/fs/zfs/sys/vdev.h
index 0c0bc874c106..e21989641b91 100644
--- a/uts/common/fs/zfs/sys/vdev.h
+++ b/uts/common/fs/zfs/sys/vdev.h
@@ -139,6 +139,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h
index 4e1b09c27dda..a91927dbb6a5 100644
--- a/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/uts/common/fs/zfs/sys/vdev_impl.h
@@ -225,6 +225,7 @@ struct vdev {
vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
+ boolean_t vdev_nonrot; /* true if solid state */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
uint64_t vdev_crtxg; /* txg when top-level was added */
diff --git a/uts/common/fs/zfs/sys/zfs_ioctl.h b/uts/common/fs/zfs/sys/zfs_ioctl.h
index 824d1d8bb70f..70916c45b7e5 100644
--- a/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -318,13 +318,15 @@ typedef struct zinject_record {
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
- uint32_t zi_pad;
+ uint32_t zi_dvas;
} zinject_record_t;
#define ZINJECT_NULL 0x1
#define ZINJECT_FLUSH_ARC 0x2
#define ZINJECT_UNLOAD_SPA 0x4
+#define ZI_NO_DVA (-1)
+
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index 38e366cc34a9..7ca3429b46de 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -1479,19 +1479,27 @@ vdev_open_children(vdev_t *vd)
* spa_namespace_lock
*/
if (vdev_uses_zvols(vd)) {
+retry_sync:
for (int c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
- return;
+ } else {
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ if (tq == NULL)
+ goto retry_sync;
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child,
+ vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+
+ taskq_destroy(tq);
}
- tq = taskq_create("vdev_open", children, minclsyspri,
- children, children, TASKQ_PREPOPULATE);
- for (int c = 0; c < children; c++)
- VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
- TQ_SLEEP) != TASKQID_INVALID);
+ vd->vdev_nonrot = B_TRUE;
- taskq_destroy(tq);
+ for (int c = 0; c < children; c++)
+ vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
}
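The loop added at the end of vdev_open_children() makes vdev_nonrot an AND-reduction over the children, so a mirror of two SSDs and one HDD is still treated as rotating. A standalone sketch with made-up child flags:

#include <stdio.h>

typedef enum { B_FALSE = 0, B_TRUE = 1 } boolean_t;

int
main(void)
{
        /* made-up children: two SSDs and one HDD */
        boolean_t child_nonrot[] = { B_TRUE, B_TRUE, B_FALSE };
        boolean_t nonrot = B_TRUE;

        for (int c = 0; c < 3; c++)
                nonrot &= child_nonrot[c];

        (void) printf("top-level vdev_nonrot = %d\n", nonrot); /* prints 0 */
        return (0);
}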
/*
diff --git a/uts/common/fs/zfs/vdev_disk.c b/uts/common/fs/zfs/vdev_disk.c
index 93462ee2ba53..3f137c5d59c5 100644
--- a/uts/common/fs/zfs/vdev_disk.c
+++ b/uts/common/fs/zfs/vdev_disk.c
@@ -606,6 +606,16 @@ skip_open:
*/
vd->vdev_nowritecache = B_FALSE;
+ /* Inform the ZIO pipeline that we are non-rotational */
+ vd->vdev_nonrot = B_FALSE;
+ if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state")) {
+ if (ldi_prop_get_int(dvd->vd_lh,
+ LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+ "device-solid-state", B_FALSE) != 0)
+ vd->vdev_nonrot = B_TRUE;
+ }
+
return (0);
}
diff --git a/uts/common/fs/zfs/vdev_file.c b/uts/common/fs/zfs/vdev_file.c
index 3aaebe8505cd..806716200a6b 100644
--- a/uts/common/fs/zfs/vdev_file.c
+++ b/uts/common/fs/zfs/vdev_file.c
@@ -58,6 +58,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
vattr_t vattr;
int error;
+ /* Rotational optimizations only make sense on block devices */
+ vd->vdev_nonrot = B_TRUE;
+
/*
* We must have a pathname, and it must be absolute.
*/
diff --git a/uts/common/fs/zfs/vdev_mirror.c b/uts/common/fs/zfs/vdev_mirror.c
index f489bb196774..f654bf9afb16 100644
--- a/uts/common/fs/zfs/vdev_mirror.c
+++ b/uts/common/fs/zfs/vdev_mirror.c
@@ -38,6 +38,65 @@
#include <sys/fs/zfs.h>
/*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+ kstat_named_t vdev_mirror_stat_rotating_linear;
+ kstat_named_t vdev_mirror_stat_rotating_offset;
+ kstat_named_t vdev_mirror_stat_rotating_seek;
+ kstat_named_t vdev_mirror_stat_non_rotating_linear;
+ kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+ kstat_named_t vdev_mirror_stat_preferred_found;
+ kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+ /* New I/O follows directly the last I/O */
+ { "rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+ { "rotating_offset", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek */
+ { "rotating_seek", KSTAT_DATA_UINT64 },
+ /* New I/O follows directly the last I/O (nonrot) */
+ { "non_rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek (nonrot) */
+ { "non_rotating_seek", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev found */
+ { "preferred_found", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev not found or equal load */
+ { "preferred_not_found", KSTAT_DATA_UINT64 },
+
+};
+
+#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64)
+#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val)
+#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+ mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+ "misc", KSTAT_TYPE_NAMED,
+ sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (mirror_ksp != NULL) {
+ mirror_ksp->ks_data = &mirror_stats;
+ kstat_install(mirror_ksp);
+ }
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+ if (mirror_ksp != NULL) {
+ kstat_delete(mirror_ksp);
+ mirror_ksp = NULL;
+ }
+}
+
+/*
* Virtual device vector for mirroring.
*/
@@ -45,48 +104,182 @@ typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
+ int mc_load;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
} mirror_child_t;
typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
int mm_children;
int mm_resilvering;
- int mm_preferred;
int mm_root;
- mirror_child_t mm_child[1];
+ mirror_child_t mm_child[];
} mirror_map_t;
int vdev_mirror_shift = 21;
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs, which are
+ * likely to offer higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return (mm);
+}
+
static void
vdev_mirror_map_free(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
- vdev_mirror_map_free,
- zio_vsd_default_cksum_report
+ .vsd_free = vdev_mirror_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
};
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t last_offset;
+ int64_t offset_diff;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+ * We don't return INT_MAX if the device is resilvering (i.e.
+ * vdev_resilver_txg != 0) because, when tested, performance was
+ * slightly worse overall when resilvering compared to without.
+ */
+
+ /* Fix zio_offset for leaf vdevs */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zio_offset += VDEV_LABEL_START_SIZE;
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ last_offset = vdev_queue_last_offset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
+ return (load + zfs_vdev_mirror_non_rotating_inc);
+ }
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/O's can be aggregated into fewer operations on
+ * the device, thus avoiding unnecessary per-command overhead
+ * and boosting performance.
+ */
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
+ return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
+ return (load + zfs_vdev_mirror_rotating_inc);
+ }
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+ * of the last I/O issued to this vdev as they should incur less
+ * of a seek increment.
+ */
+ offset_diff = (int64_t)(last_offset - zio_offset);
+ if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
+ return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+ }
+
+ /* Apply the full seek increment to all other I/O's. */
+ MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
+ return (load + zfs_vdev_mirror_rotating_seek_inc);
+}
+
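Plugging the default tunables into the logic of vdev_mirror_load() above gives a feel for the numbers. The sketch below is a simplified userland restatement (mirror_load and its arguments are hypothetical stand-ins for the vdev queue state), not the kernel function:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* default tunables from the patch */
static int rotating_inc = 0;
static int rotating_seek_inc = 5;
static int64_t rotating_seek_offset = 1 * 1024 * 1024;
static int non_rotating_inc = 0;
static int non_rotating_seek_inc = 1;

static int
mirror_load(int queue_len, uint64_t last_off, uint64_t off, int nonrot)
{
        if (nonrot) {
                if (last_off == off)            /* linear access */
                        return (queue_len + non_rotating_inc);
                return (queue_len + non_rotating_seek_inc);
        }
        if (last_off == off)                    /* linear access */
                return (queue_len + rotating_inc);
        if (llabs((long long)(last_off - off)) < rotating_seek_offset)
                return (queue_len + rotating_seek_inc / 2);     /* short seek */
        return (queue_len + rotating_seek_inc);                 /* full seek */
}

int
main(void)
{
        uint64_t last = 1 << 20;

        /* HDD, 512 KB from the last I/O: 4 + 5/2 = 6 */
        (void) printf("hdd short seek: %d\n",
            mirror_load(4, last, last + (512 << 10), 0));
        /* SSD, sequential: 4 + 0 = 4 */
        (void) printf("ssd linear:     %d\n", mirror_load(4, last, last, 1));
        return (0);
}

With a queue depth of 4, the rotating child half a megabyte away scores 6, while the sequential SSD read scores only its queue depth.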
static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
{
mirror_map_t *mm = NULL;
mirror_child_t *mc;
vdev_t *vd = zio->io_vd;
- int c, d;
+ int c;
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
+ dsl_scan_t *scn = NULL;
dva_t dva_copy[SPA_DVAS_PER_BP];
- c = BP_GET_NDVAS(zio->io_bp);
+ if (spa->spa_dsl_pool != NULL) {
+ scn = spa->spa_dsl_pool->dp_scan;
+ }
+ /*
+ * The sequential scrub code sorts and issues all DVAs
+ * of a bp separately. Each of these IOs includes all
+ * original DVA copies so that repairs can be performed
+ * in the event of an error, but we only actually want
+ * to check the first DVA since the others will be
+ * checked by their respective sorted IOs. Only if we
+ * hit an error will we try all DVAs upon retrying.
+ *
+ * Note: This check is safe even if the user switches
+ * from a legacy scrub to a sequential one in the middle
+ * of processing, since scn_is_sorted isn't updated until
+ * all outstanding IOs from the previous scrub pass
+ * complete.
+ */
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+ scn != NULL &&
+ scn->scn_is_sorted &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool)) {
+ c = 1;
+ } else {
+ c = BP_GET_NDVAS(zio->io_bp);
+ }
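The effect of this branch on the mirror map can be restated as a tiny decision sketch; the flags are made-up stand-ins for zio->io_flags and the scan state:

#include <stdio.h>

int
main(void)
{
        /* made-up stand-ins for zio->io_flags and scn state */
        int is_scrub = 1, is_retry = 0, scn_is_sorted = 1, ndvas = 3;
        int children;

        if (is_scrub && !is_retry && scn_is_sorted)
                children = 1;   /* other sorted zios cover DVAs 1..n-1 */
        else
                children = ndvas;       /* retry or legacy scrub: try every copy */

        (void) printf("mirror map children = %d\n", children);
        return (0);
}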
/*
* If we do not trust the pool config, some DVAs might be
@@ -110,24 +303,7 @@ vdev_mirror_map_alloc(zio_t *zio)
}
}
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_resilvering = B_FALSE;
- mm->mm_preferred = spa_get_random(c);
- mm->mm_root = B_TRUE;
-
- /*
- * Check the other, lower-index DVAs to see if they're on
- * the same vdev as the child we picked. If they are, use
- * them since they are likely to have been allocated from
- * the primary metaslab in use at the time, and hence are
- * more likely to have locality with single-copy data.
- */
- for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
- if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
- mm->mm_preferred = d;
- }
-
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -135,12 +311,6 @@ vdev_mirror_map_alloc(zio_t *zio)
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
- int replacing;
-
- c = vd->vdev_children;
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
/*
* If we are resilvering, then we should handle scrub reads
* differently; we shouldn't issue them to the resilvering
@@ -164,25 +334,12 @@ vdev_mirror_map_alloc(zio_t *zio)
* automatically removed from the pool after the user replaces
* the device that originally failed.
*/
- replacing = (vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- /*
- * If a spa load is in progress, then spa_dsl_pool may be
- * uninitialized. But we shouldn't be resilvering during a spa
- * load anyway.
- */
- if (replacing &&
- (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
- dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
- mm->mm_resilvering = B_TRUE;
- } else {
- mm->mm_resilvering = B_FALSE;
- }
-
- mm->mm_preferred = mm->mm_resilvering ? 0 :
- (zio->io_offset >> vdev_mirror_shift) % c;
- mm->mm_root = B_FALSE;
-
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
@@ -269,6 +426,7 @@ vdev_mirror_scrub_done(zio_t *zio)
}
mutex_exit(&zio->io_lock);
}
+
abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
@@ -277,6 +435,54 @@ vdev_mirror_scrub_done(zio_t *zio)
}
/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p--; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+ * which could lead to wear leveling issues on SSDs, we
+ * use the I/O offset as a pseudo random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
+
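With vdev_mirror_shift at its default of 21, the offset-seeded pick above changes every 2 MB, spreading equally loaded reads across the preferred children. A sketch of just the index computation:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        int vdev_mirror_shift = 21;     /* 2 MB granularity, as in the patch */
        int preferred_cnt = 2;          /* two equally loaded children */

        for (uint64_t off = 0; off < 4; off++) {
                uint64_t io_offset = off * 2097152;     /* step by 2 MB */
                (void) printf("offset %8llu -> preferred[%llu]\n",
                    (unsigned long long)io_offset,
                    (unsigned long long)((io_offset >> vdev_mirror_shift) %
                    preferred_cnt));
        }
        return (0);
}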
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on determined load.
+ *
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
*/
@@ -284,43 +490,64 @@ static int
vdev_mirror_child_select(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
uint64_t txg = zio->io_txg;
- int i, c;
+ int c, lowest_load;
ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
- /*
- * Try to find a child whose DTL doesn't contain the block to read.
- * If a child is known to be completely inaccessible (indicated by
- * vdev_readable() returning B_FALSE), don't even try.
- */
- for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
- if (c >= mm->mm_children)
- c = 0;
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
- if (!vdev_readable(mc->mc_vd)) {
+
+ if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
- if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
- return (c);
- mc->mc_error = SET_ERROR(ESTALE);
- mc->mc_skipped = 1;
- mc->mc_speculative = 1;
+
+ if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
+ }
+
+ if (mm->mm_preferred_cnt == 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_found);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
+ return (vdev_mirror_preferred_child_randomize(zio));
}
/*
* Every device is either missing or has this txg in its DTL.
* Look for any child we haven't already tried before giving up.
*/
- for (c = 0; c < mm->mm_children; c++)
+ for (c = 0; c < mm->mm_children; c++) {
if (!mm->mm_child[c].mc_tried)
return (c);
+ }
/*
* Every child failed. There's no place left to look.
@@ -335,7 +562,7 @@ vdev_mirror_io_start(zio_t *zio)
mirror_child_t *mc;
int c, children;
- mm = vdev_mirror_map_alloc(zio);
+ mm = vdev_mirror_map_init(zio);
if (mm == NULL) {
ASSERT(!spa_trust_config(zio->io_spa));
diff --git a/uts/common/fs/zfs/vdev_queue.c b/uts/common/fs/zfs/vdev_queue.c
index 0643c05f573d..a89e06ebbf60 100644
--- a/uts/common/fs/zfs/vdev_queue.c
+++ b/uts/common/fs/zfs/vdev_queue.c
@@ -276,6 +276,8 @@ vdev_queue_init(vdev_t *vd)
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
+
+ vq->vq_last_offset = 0;
}
void
@@ -701,7 +703,7 @@ again:
*/
tree = vdev_queue_class_tree(vq, p);
search.io_timestamp = 0;
- search.io_offset = vq->vq_last_offset + 1;
+ search.io_offset = vq->vq_last_offset - 1;
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
@@ -729,7 +731,7 @@ again:
}
vdev_queue_pending_add(vq, zio);
- vq->vq_last_offset = zio->io_offset;
+ vq->vq_last_offset = zio->io_offset + zio->io_size;
return (zio);
}
@@ -849,12 +851,39 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
*/
tree = vdev_queue_class_tree(vq, zio->io_priority);
if (avl_find(tree, zio, NULL) == zio) {
+ spa_t *spa = zio->io_spa;
+ zio_priority_t oldpri = zio->io_priority;
+
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
zio->io_priority = priority;
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0);
+ spa->spa_queue_stats[oldpri].spa_queued--;
+ spa->spa_queue_stats[zio->io_priority].spa_queued++;
+ mutex_exit(&spa->spa_iokstat_lock);
} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
zio->io_priority = priority;
}
mutex_exit(&vq->vq_lock);
}
+
+/*
+ * As these two methods are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the
+ * lack of vq_lock mutex use here; instead we prefer to keep it lock-free
+ * for performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_last_offset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_last_offset);
+}
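The vq_last_offset change is subtle: it now records the end (io_offset + io_size) of the last issued I/O, and the LBA scheduler searches from vq_last_offset - 1 with AVL_AFTER so an I/O starting exactly at that end is picked up as linear. A simplified sketch over a sorted offset array, with the AVL tree elided:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        /* pending read offsets, already LBA-sorted as in the class tree */
        uint64_t pending[] = { 4096, 8192, 65536, 131072 };
        uint64_t last_offset = 8192 + 4096;     /* io_offset + io_size of last zio */

        for (int i = 0; i < 4; i++) {
                /* avl_nearest(..., AVL_AFTER) from last_offset - 1 */
                if (pending[i] >= last_offset) {
                        (void) printf("next zio at offset %llu\n",
                            (unsigned long long)pending[i]);
                        break;
                }
        }
        return (0);
}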
diff --git a/uts/common/fs/zfs/zio_inject.c b/uts/common/fs/zfs/zio_inject.c
index 26f59af9968f..71b859bc3dfd 100644
--- a/uts/common/fs/zfs/zio_inject.c
+++ b/uts/common/fs/zfs/zio_inject.c
@@ -102,7 +102,7 @@ static int inject_next_id = 1;
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
-zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva,
zinject_record_t *record, int error)
{
/*
@@ -127,9 +127,11 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
zb->zb_level == record->zi_level &&
zb->zb_blkid >= record->zi_start &&
zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
+ (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ error == record->zi_error) {
return (record->zi_freq == 0 ||
spa_get_random(100) < record->zi_freq);
+ }
return (B_FALSE);
}
@@ -159,6 +161,38 @@ zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
rw_exit(&inject_lock);
}
+
+/*
+ * If this is a physical I/O for a vdev child determine which DVA it is
+ * for. We iterate backwards through the DVAs matching on the offset so
+ * that we end up with ZI_NO_DVA (-1) if we don't find a match.
+ */
+static int
+zio_match_dva(zio_t *zio)
+{
+ int i = ZI_NO_DVA;
+
+ if (zio->io_bp != NULL && zio->io_vd != NULL &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
+ dva_t *dva = &zio->io_bp->blk_dva[i];
+ uint64_t off = DVA_GET_OFFSET(dva);
+ vdev_t *vd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(dva));
+
+ /* Compensate for vdev label added to leaves */
+ if (zio->io_vd->vdev_ops->vdev_op_leaf)
+ off += VDEV_LABEL_START_SIZE;
+
+ if (zio->io_vd == vd && zio->io_offset == off)
+ break;
+ }
+ }
+
+ return (i);
+}
+
+
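Once zio_match_dva() has resolved the DVA index, zio_match_handler() applies the mask registered by 'zinject -C', as added in the hunk above (zi_dvas == 0 means match any DVA). A sketch of that test in isolation for a handler registered with '-C 0,2':

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t zi_dvas = 0x5;         /* handler registered with -C 0,2 */

        for (int dva = 0; dva < 3; dva++) {
                /* zi_dvas == 0 means "any DVA"; ZI_NO_DVA cases not shown */
                int match = (zi_dvas == 0) || (zi_dvas & (1ULL << dva));
                (void) printf("DVA %d: %s\n", dva,
                    match ? "inject error" : "pass through");
        }
        return (0);
}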
/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
@@ -190,10 +224,10 @@ zio_handle_fault_injection(zio_t *zio, int error)
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
- /* If this handler matches, return EIO */
+ /* If this handler matches, return the specified error */
if (zio_match_handler(&zio->io_logical->io_bookmark,
zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
+ zio_match_dva(zio), &handler->zi_record, error)) {
ret = error;
break;
}