aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndriy Gapon <avg@FreeBSD.org>2019-10-15 14:24:32 +0000
committerAndriy Gapon <avg@FreeBSD.org>2019-10-15 14:24:32 +0000
commita4a9c977b9efb8f606c662e31d221af7b15c2a2d (patch)
tree6dabdf92986781c38d6fbef4de2ee5e0ac108d62
parentf21865af24bc49220343581d5261ef9d9c641f69 (diff)
downloadsrc-a4a9c977b9efb8f606c662e31d221af7b15c2a2d.tar.gz
src-a4a9c977b9efb8f606c662e31d221af7b15c2a2d.zip
10572 10579 Fix race in dnode_check_slots_free()
illumos/illumos-gate@aa02ea01948372a32cbf08bfc31c72c32e3fc81e https://github.com/illumos/illumos-gate/commit/aa02ea01948372a32cbf08bfc31c72c32e3fc81e 10572 Fix race in dnode_check_slots_free() https://www.illumos.org/issues/10572 The Fix from ZoL: Currently, dnode_check_slots_free() works by checking dn->dn_type in the dnode to determine if the dnode is reclaimable. However, there is a small window of time between dnode_free_sync() in the first call to dsl_dataset_sync() and when the useraccounting code is run when the type is set DMU_OT_NONE, but the dnode is not yet evictable, leading to crashes. This patch adds the ability for dnodes to track which txg they were last dirtied in and adds a check for this before performing the reclaim. This patch also corrects several instances when dn_dirty_link was treated as a list_node_t when it is technically a multilist_node_t. 10579 Don't allow dnode allocation if dn_holds != 0 https://www.illumos.org/issues/10579 The fix from ZoL: This patch simply fixes a small bug where dnode_hold_impl() could attempt to allocate a dnode that was in the process of being freed, but which still had active references. This patch simply adds the required check. Author: Tom Caputi <tcaputi@datto.com>
Notes
Notes: svn path=/vendor-sys/illumos/dist/; revision=353558
-rw-r--r--uts/common/fs/zfs/dbuf.c3
-rw-r--r--uts/common/fs/zfs/dmu_objset.c15
-rw-r--r--uts/common/fs/zfs/dnode.c30
-rw-r--r--uts/common/fs/zfs/sys/dmu_impl.h2
-rw-r--r--uts/common/fs/zfs/sys/dnode.h4
5 files changed, 43 insertions, 11 deletions
diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c
index 4bb53837716c..c5ff59f78a18 100644
--- a/uts/common/fs/zfs/dbuf.c
+++ b/uts/common/fs/zfs/dbuf.c
@@ -1570,6 +1570,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
FTAG);
}
}
+
+ if (tx->tx_txg > dn->dn_dirty_txg)
+ dn->dn_dirty_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
if (db->db_blkid == DMU_SPILL_BLKID)
diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c
index db0fff702ef8..36e6391d5718 100644
--- a/uts/common/fs/zfs/dmu_objset.c
+++ b/uts/common/fs/zfs/dmu_objset.c
@@ -1247,10 +1247,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
multilist_sublist_remove(list, dn);
+ /*
+ * If we are not doing useraccounting (os_synced_dnodes == NULL)
+ * we are done with this dnode for this txg. Unset dn_dirty_txg
+ * if later txgs aren't dirtying it so that future holders do
+ * not get a stale value. Otherwise, we will do this in
+ * userquota_updates_task() when processing has completely
+ * finished for this txg.
+ */
multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
if (newlist != NULL) {
(void) dnode_add_ref(dn, newlist);
multilist_insert(newlist, dn);
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_dirty_txg == tx->tx_txg)
+ dn->dn_dirty_txg = 0;
+ mutex_exit(&dn->dn_mtx);
}
dnode_sync(dn, tx);
@@ -1610,6 +1623,8 @@ userquota_updates_task(void *arg)
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
}
dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
+ if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
+ dn->dn_dirty_txg = 0;
mutex_exit(&dn->dn_mtx);
multilist_sublist_remove(list, dn);
diff --git a/uts/common/fs/zfs/dnode.c b/uts/common/fs/zfs/dnode.c
index 6c1d91ddfa98..7e9b18d145a0 100644
--- a/uts/common/fs/zfs/dnode.c
+++ b/uts/common/fs/zfs/dnode.c
@@ -150,7 +150,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
- list_link_init(&dn->dn_dirty_link[i]);
+ multilist_link_init(&dn->dn_dirty_link[i]);
dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
@@ -160,6 +160,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
dn->dn_dirtyctx = 0;
dn->dn_dirtyctx_firstset = NULL;
dn->dn_bonus = NULL;
@@ -197,7 +198,7 @@ dnode_dest(void *arg, void *unused)
ASSERT(!list_link_active(&dn->dn_link));
for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
ASSERT0(dn->dn_next_nblkptr[i]);
@@ -212,6 +213,7 @@ dnode_dest(void *arg, void *unused)
ASSERT0(dn->dn_allocated_txg);
ASSERT0(dn->dn_free_txg);
ASSERT0(dn->dn_assigned_txg);
+ ASSERT0(dn->dn_dirty_txg);
ASSERT0(dn->dn_dirtyctx);
ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
ASSERT3P(dn->dn_bonus, ==, NULL);
@@ -543,6 +545,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
dn->dn_dirtyctx = 0;
if (dn->dn_dirtyctx_firstset != NULL) {
@@ -612,6 +615,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_dirty_txg);
ASSERT0(dn->dn_assigned_txg);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
@@ -625,7 +629,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT0(dn->dn_next_bonustype[i]);
ASSERT0(dn->dn_rm_spillblk[i]);
ASSERT0(dn->dn_next_blksz[i]);
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
}
@@ -802,6 +806,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirty_txg = odn->dn_dirty_txg;
ndn->dn_dirtyctx = odn->dn_dirtyctx;
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
@@ -868,6 +873,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
odn->dn_allocated_txg = 0;
odn->dn_free_txg = 0;
odn->dn_assigned_txg = 0;
+ odn->dn_dirty_txg = 0;
odn->dn_dirtyctx = 0;
odn->dn_dirtyctx_firstset = NULL;
odn->dn_have_spill = B_FALSE;
@@ -1092,6 +1098,10 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
{
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+ /*
+ * If all dnode slots are either already free or
+ * evictable return B_TRUE.
+ */
for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
dnode_t *dn = dnh->dnh_dnode;
@@ -1100,18 +1110,18 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
continue;
} else if (DN_SLOT_IS_PTR(dn)) {
mutex_enter(&dn->dn_mtx);
- dmu_object_type_t type = dn->dn_type;
+ boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
+ zfs_refcount_is_zero(&dn->dn_holds) &&
+ !DNODE_IS_DIRTY(dn));
mutex_exit(&dn->dn_mtx);
- if (type != DMU_OT_NONE)
+ if (!can_free)
return (B_FALSE);
-
- continue;
+ else
+ continue;
} else {
return (B_FALSE);
}
-
- return (B_FALSE);
}
return (B_TRUE);
@@ -1634,7 +1644,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
/*
* If we are already marked dirty, we're done.
*/
- if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+ if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
multilist_sublist_unlock(mls);
return;
}
diff --git a/uts/common/fs/zfs/sys/dmu_impl.h b/uts/common/fs/zfs/sys/dmu_impl.h
index 0930be6a8506..e820fe57ecb3 100644
--- a/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/uts/common/fs/zfs/sys/dmu_impl.h
@@ -162,7 +162,7 @@ extern "C" {
* dn_allocated_txg
* dn_free_txg
* dn_assigned_txg
- * dd_assigned_tx
+ * dn_dirty_txg
* dn_notxholds
* dn_dirtyctx
* dn_dirtyctx_firstset
diff --git a/uts/common/fs/zfs/sys/dnode.h b/uts/common/fs/zfs/sys/dnode.h
index 68872a8e9e88..e2c661163212 100644
--- a/uts/common/fs/zfs/sys/dnode.h
+++ b/uts/common/fs/zfs/sys/dnode.h
@@ -296,6 +296,7 @@ struct dnode {
uint64_t dn_allocated_txg;
uint64_t dn_free_txg;
uint64_t dn_assigned_txg;
+ uint64_t dn_dirty_txg; /* txg dnode was last dirtied */
kcondvar_t dn_notxholds;
enum dnode_dirtycontext dn_dirtyctx;
uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
@@ -399,6 +400,9 @@ void dnode_evict_bonus(dnode_t *dn);
void dnode_free_interior_slots(dnode_t *dn);
boolean_t dnode_needs_remap(const dnode_t *dn);
+#define DNODE_IS_DIRTY(_dn) \
+ ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
+
#define DNODE_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(DMU_OT_IS_METADATA((_dn)->dn_type) && \