author | Martin Matuska <mm@FreeBSD.org> | 2012-07-18 08:12:04 +0000
committer | Martin Matuska <mm@FreeBSD.org> | 2012-07-18 08:12:04 +0000
commit | af56e8c4b416d774961b41eee1eb349d657ebb8c (patch)
tree | e332d1e6089905f45302dedddb9967a87ade136a /uts/common/fs/zfs
parent | 93a00b0821525e25814cd720fafd04d600811c28 (diff)
download | src-af56e8c4b416d774961b41eee1eb349d657ebb8c.tar.gz, src-af56e8c4b416d774961b41eee1eb349d657ebb8c.zip
Update vendor-sys/opensolaris to last OpenSolaris state (13149:b23a4dab3d50) (refs: vendor/opensolaris/20100818, vendor/opensolaris)
Add ZFS bits to vendor-sys/opensolaris
Obtained from: https://hg.openindiana.org/upstream/oracle/onnv-gate
Notes:
svn path=/vendor-sys/opensolaris/dist/; revision=238567
svn path=/vendor-sys/opensolaris/20100818/; revision=238568; tag=vendor/opensolaris/20100818
Diffstat (limited to 'uts/common/fs/zfs')
141 files changed, 101983 insertions, 0 deletions
diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c new file mode 100644 index 000000000000..a82718e8bc6e --- /dev/null +++ b/uts/common/fs/zfs/arc.c @@ -0,0 +1,4658 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * DVA-based Adjustable Replacement Cache + * + * While much of the theory of operation used here is + * based on the self-tuning, low overhead replacement cache + * presented by Megiddo and Modha at FAST 2003, there are some + * significant differences: + * + * 1. The Megiddo and Modha model assumes any page is evictable. + * Pages in its cache cannot be "locked" into memory. This makes + * the eviction algorithm simple: evict the last page in the list. + * This also make the performance characteristics easy to reason + * about. Our cache is not so simple. At any given moment, some + * subset of the blocks in the cache are un-evictable because we + * have handed out a reference to them. Blocks are only evictable + * when there are no external references active. This makes + * eviction far more problematic: we choose to evict the evictable + * blocks that are the "lowest" in the list. + * + * There are times when it is not possible to evict the requested + * space. In these circumstances we are unable to adjust the cache + * size. To prevent the cache growing unbounded at these times we + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. + * + * 2. The Megiddo and Modha model assumes a fixed cache size. + * Pages are evicted when the cache is full and there is a cache + * miss. Our model has a variable sized cache. It grows with + * high use, but also tries to react to memory pressure from the + * operating system: decreasing its size when system memory is + * tight. + * + * 3. The Megiddo and Modha model assumes a fixed page size. All + * elements of the cache are therefor exactly the same size. So + * when adjusting the cache size following a cache miss, its simply + * a matter of choosing a single page to evict. In our model, we + * have variable sized cache blocks (rangeing from 512 bytes to + * 128K bytes). We therefor choose a set of blocks to evict to make + * space for a cache miss that approximates as closely as possible + * the space used by the new block. + * + * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" + * by N. Megiddo & D. Modha, FAST 2003 + */ + +/* + * The locking model: + * + * A new reference to a cache buffer can be obtained in two + * ways: 1) via a hash table lookup using the DVA as a key, + * or 2) via one of the ARC lists. 
The arc_read() interface + * uses method 1, while the internal arc algorithms for + * adjusting the cache use method 2. We therefor provide two + * types of locks: 1) the hash table lock array, and 2) the + * arc list locks. + * + * Buffers do not have their own mutexs, rather they rely on the + * hash table mutexs for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexs). + * + * buf_hash_find() returns the appropriate mutex (held) when it + * locates the requested buffer in the hash table. It returns + * NULL for the mutex if the buffer was not in the table. + * + * buf_hash_remove() expects the appropriate hash mutex to be + * already held before it is invoked. + * + * Each arc state also has a mutex which is used to protect the + * buffer list associated with the state. When attempting to + * obtain a hash table lock while holding an arc list lock you + * must use: mutex_tryenter() to avoid deadlock. Also note that + * the active state mutex must be held before the ghost state mutex. + * + * Arc buffers may have an associated eviction callback function. + * This function will be invoked prior to removing the buffer (e.g. + * in arc_do_user_evicts()). Note however that the data associated + * with the buffer may be evicted prior to the callback. The callback + * must be made with *no locks held* (to prevent deadlock). Additionally, + * the users of callbacks must ensure that their private data is + * protected from simultaneous callbacks from arc_buf_evict() + * and arc_do_user_evicts(). + * + * Note that the majority of the performance stats are manipulated + * with atomic operations. + * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists + */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#ifdef _KERNEL +#include <sys/vmsystm.h> +#include <vm/anon.h> +#include <sys/fs/swapnode.h> +#include <sys/dnlc.h> +#endif +#include <sys/callb.h> +#include <sys/kstat.h> +#include <zfs_fletcher.h> + +static kmutex_t arc_reclaim_thr_lock; +static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ +static uint8_t arc_thread_exit; + +extern int zfs_write_limit_shift; +extern uint64_t zfs_write_limit_max; +extern kmutex_t zfs_write_limit_lock; + +#define ARC_REDUCE_DNLC_PERCENT 3 +uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; + +typedef enum arc_reclaim_strategy { + ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ + ARC_RECLAIM_CONS /* Conservative reclaim strategy */ +} arc_reclaim_strategy_t; + +/* number of seconds before growing cache again */ +static int arc_grow_retry = 60; + +/* shift of arc_c for calculating both min and max arc_p */ +static int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +static int arc_shrink_shift = 5; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_lifespan; + +static int arc_dead; + +/* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* + * These tunables are for performance analysis. 
+ */ +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_p_min_shift = 0; + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recentely used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. 
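(Editorial aside, not part of the imported file: the state descriptions above condense into a small stand-alone sketch. The enum and promotion rule below are illustrative only; the real transitions are implemented by arc_access(), which is only declared, not defined, within this excerpt.)

#include <stdio.h>

/* Toy mirror of the six states described in the comment above. */
typedef enum {
	TOY_ANON,	/* dirty copy, no DVA yet */
	TOY_MRU,	/* used once recently, data cached */
	TOY_MRU_GHOST,	/* used once recently, data evicted */
	TOY_MFU,	/* used more than once, data cached */
	TOY_MFU_GHOST,	/* used more than once, data evicted */
	TOY_L2C_ONLY	/* header kept only for the L2ARC copy */
} toy_state_t;

/*
 * Toy rule of thumb: an anonymous buffer that acquires a DVA lands on the
 * MRU side (used once); any subsequent hit means the buffer has now been
 * used more than once, so it moves to, or stays on, the MFU side.
 */
static toy_state_t
toy_next_state(toy_state_t s)
{
	return (s == TOY_ANON ? TOY_MRU : TOY_MFU);
}

int
main(void)
{
	toy_state_t s = TOY_ANON;

	s = toy_next_state(s);	/* written out: anon -> MRU */
	s = toy_next_state(s);	/* hit again:   MRU -> MFU */
	printf("final state: %d (TOY_MFU)\n", (int)s);
	return (0);
}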
+ */ + +typedef struct arc_state { + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ + kmutex_t arcs_mtx; +} arc_state_t; + +/* The 6 states: */ +static arc_state_t ARC_anon; +static arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +static arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + kstat_named_t arcstat_recycle_miss; + kstat_named_t arcstat_mutex_miss; + kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_data_size; + kstat_named_t arcstat_other_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_memory_throttle_count; +} arc_stats_t; + +static arc_stats_t arc_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_misses", KSTAT_DATA_UINT64 }, + { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_misses", KSTAT_DATA_UINT64 }, + { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_misses", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, + { "mru_hits", KSTAT_DATA_UINT64 }, + { "mru_ghost_hits", KSTAT_DATA_UINT64 }, + { "mfu_hits", KSTAT_DATA_UINT64 }, + { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "deleted", KSTAT_DATA_UINT64 }, + { "recycle_miss", KSTAT_DATA_UINT64 }, + { "mutex_miss", KSTAT_DATA_UINT64 }, + { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_l2_cached", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { 
"hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "p", KSTAT_DATA_UINT64 }, + { "c", KSTAT_DATA_UINT64 }, + { "c_min", KSTAT_DATA_UINT64 }, + { "c_max", KSTAT_DATA_UINT64 }, + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "other_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 } +}; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)); + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define ARCSTAT_MAX(stat, val) { \ + uint64_t m; \ + while ((val) > (m = arc_stats.stat.value.ui64) && \ + (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ + continue; \ +} + +#define ARCSTAT_MAXSTAT(stat) \ + ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) + +/* + * We define a macro to allow ARC hits/misses to be easily broken down by + * two separate conditions, giving a total of four different subtypes for + * each of hits and misses (so eight statistics total). + */ +#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ + if (cond1) { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ + } \ + } else { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ + } \ + } + +kstat_t *arc_ksp; +static arc_state_t *arc_anon; +static arc_state_t *arc_mru; +static arc_state_t *arc_mru_ghost; +static arc_state_t *arc_mfu; +static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. 
+ */ +#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ + +static int arc_no_grow; /* Don't try to grow cache size */ +static uint64_t arc_tempreserve; +static uint64_t arc_loaned_bytes; +static uint64_t arc_meta_used; +static uint64_t arc_meta_limit; +static uint64_t arc_meta_max = 0; + +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_done_func_t *acb_done; + arc_buf_t *acb_buf; + zio_t *acb_zio_dummy; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + uint64_t b_cksum0; + + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; + void *b_thawed; + + arc_buf_hdr_t *b_hash_next; + arc_buf_t *b_buf; + uint32_t b_flags; + uint32_t b_datacnt; + + arc_callback_t *b_acb; + kcondvar_t b_cv; + + /* immutable */ + arc_buf_contents_t b_type; + uint64_t b_size; + uint64_t b_spa; + + /* protected by arc state mutex */ + arc_state_t *b_state; + list_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; +}; + +static arc_buf_t *arc_eviction_list; +static kmutex_t arc_eviction_mtx; +static arc_buf_hdr_t arc_eviction_hdr; +static void arc_get_data_buf(arc_buf_t *buf); +static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); +static int arc_evict_needed(arc_buf_contents_t type); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); + +static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); + +#define GHOST_STATE(state) \ + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) + +/* + * Private ARC flags. These flags are private ARC only flags that will show up + * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can + * be passed in as arc_flags in things like arc_read. However, these flags + * should never be passed and should only be set by ARC code. When adding new + * public flags, make sure not to smash the private ones. 
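(Editorial aside, not part of the imported file: the ARCSTAT_MAX() macro defined above keeps a running maximum without taking a lock, by retrying a compare-and-swap until the stored value is at least as large as the sample. A minimal user-space sketch of that pattern follows; C11 atomics stand in for the kernel's atomic_cas_64(), and all names here are invented for the illustration.)

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for a kstat cell such as arcstat_hash_chain_max. */
static _Atomic uint64_t stat_max;

/*
 * Raise stat_max to val if val is larger, without a lock: re-read the
 * current value and retry whenever another thread updates it first, which
 * is the same loop ARCSTAT_MAX() expresses with atomic_cas_64().
 */
static void
stat_update_max(uint64_t val)
{
	uint64_t m = atomic_load(&stat_max);

	while (val > m &&
	    !atomic_compare_exchange_weak(&stat_max, &m, val))
		continue;	/* m was refreshed by the failed CAS; retry */
}

int
main(void)
{
	stat_update_max(10);
	stat_update_max(7);	/* no effect: 7 is not a new maximum */
	printf("max = %llu\n", (unsigned long long)atomic_load(&stat_max));
	return (0);
}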
+ */ + +#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ +#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ +#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ +#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ +#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ +#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ + +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) + +/* + * Other sizes + */ + +#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) + +/* + * Hash table routines + */ + +#define HT_LOCK_PAD 64 + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +#define BUF_LOCKS 256 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS]; +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) + +uint64_t zfs_crc64_table[256]; + +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ +boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* 
desired write size, bytes */ + uint64_t l2ad_boost; /* warmup write boost, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + +static uint64_t +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) +{ + uint8_t *vdva = (uint8_t *)dva; + uint64_t crc = -1ULL; + int i; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + for (i = 0; i < sizeof (dva_t); i++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; + + crc ^= (spa>>8) ^ birth; + + return (crc); +} + +#define BUF_EMPTY(buf) \ + ((buf)->b_dva.dva_word[0] == 0 && \ + (buf)->b_dva.dva_word[1] == 0 && \ + (buf)->b_birth == 0) + +#define BUF_EQUAL(spa, dva, birth, buf) \ + ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((buf)->b_birth == birth) && ((buf)->b_spa == spa) + +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; + hdr->b_cksum0 = 0; +} + +static arc_buf_hdr_t * +buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *buf; + + mutex_enter(hash_lock); + for (buf = buf_hash_table.ht_table[idx]; buf != NULL; + buf = buf->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, buf)) { + *lockp = hash_lock; + return (buf); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. 
If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fbuf; + uint32_t i; + + ASSERT(!HDR_IN_HASH_TABLE(buf)); + *lockp = hash_lock; + mutex_enter(hash_lock); + for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; + fbuf = fbuf->b_hash_next, i++) { + if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) + return (fbuf); + } + + buf->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = buf; + buf->b_flags |= ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + if (i > 0) { + ARCSTAT_BUMP(arcstat_hash_collisions); + if (i == 1) + ARCSTAT_BUMP(arcstat_hash_chains); + + ARCSTAT_MAX(arcstat_hash_chain_max, i); + } + + ARCSTAT_BUMP(arcstat_hash_elements); + ARCSTAT_MAXSTAT(arcstat_hash_elements); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *buf) +{ + arc_buf_hdr_t *fbuf, **bufp; + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + ASSERT(HDR_IN_HASH_TABLE(buf)); + + bufp = &buf_hash_table.ht_table[idx]; + while ((fbuf = *bufp) != buf) { + ASSERT(fbuf != NULL); + bufp = &fbuf->b_hash_next; + } + *bufp = buf->b_hash_next; + buf->b_hash_next = NULL; + buf->b_flags &= ~ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + ARCSTAT_BUMPDOWN(arcstat_hash_elements); + + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + ARCSTAT_BUMPDOWN(arcstat_hash_chains); +} + +/* + * Global data structures and functions for the buf kmem cache. + */ +static kmem_cache_t *hdr_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. + */ +/* ARGSUSED */ +static int +hdr_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_hdr_t)); + refcount_create(&buf->b_refcnt); + cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. 
+ */ +/* ARGSUSED */ +static void +hdr_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *buf = vbuf; + + ASSERT(BUF_EMPTY(buf)); + refcount_destroy(&buf->b_refcnt); + cv_destroy(&buf->b_cv); + mutex_destroy(&buf->b_freeze_lock); + arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + mutex_destroy(&buf->b_evict_lock); + rw_destroy(&buf->b_data_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); +} + +/* + * Reclaim callback -- invoked when memory is low. + */ +/* ARGSUSED */ +static void +hdr_recl(void *unused) +{ + dprintf("hdr_recl called\n"); + /* + * umem calls the reclaim func when we destroy the buf cache, + * which is after we do arc_fini(). + */ + if (!arc_dead) + cv_signal(&arc_reclaim_thr_cv); +} + +static void +buf_init(void) +{ + uint64_t *ct; + uint64_t hsize = 1ULL << 12; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 64K block size. The table will take up + * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). + */ + while (hsize * 65536 < physmem * PAGESIZE) + hsize <<= 1; +retry: + buf_hash_table.ht_mask = hsize - 1; + buf_hash_table.ht_table = + kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); + if (buf_hash_table.ht_table == NULL) { + ASSERT(hsize > (1ULL << 8)); + hsize >>= 1; + goto retry; + } + + hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), + 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static void +arc_cksum_verify(arc_buf_t *buf) +{ + zio_cksum_t zc; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || + (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + panic("buffer modified while frozen!"); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + +static void +arc_cksum_compute(arc_buf_t *buf, boolean_t force) +{ + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, + buf->b_hdr->b_freeze_cksum); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + 
arc_cksum_verify(buf); + } + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + buf->b_hdr->b_freeze_cksum = NULL; + } + + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_thawed) + kmem_free(buf->b_hdr->b_thawed, 1); + buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + } + + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_freeze(arc_buf_t *buf) +{ + kmutex_t *hash_lock; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || + buf->b_hdr->b_state == arc_anon); + arc_cksum_compute(buf, B_FALSE); + mutex_exit(hash_lock); +} + +static void +add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if ((refcount_add(&ab->b_refcnt, tag) == 1) && + (ab->b_state != arc_anon)) { + uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; + + ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); + mutex_enter(&ab->b_state->arcs_mtx); + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(list, ab); + if (GHOST_STATE(ab->b_state)) { + ASSERT3U(ab->b_datacnt, ==, 0); + ASSERT3P(ab->b_buf, ==, NULL); + delta = ab->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(&ab->b_state->arcs_mtx); + /* remove the prefetch flag if we get a reference */ + if (ab->b_flags & ARC_PREFETCH) + ab->b_flags &= ~ARC_PREFETCH; + } +} + +static int +remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + int cnt; + arc_state_t *state = ab->b_state; + + ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(!GHOST_STATE(state)); + + if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(!list_link_active(&ab->b_arc_node)); + list_insert_head(&state->arcs_list[ab->b_type], ab); + ASSERT(ab->b_datacnt > 0); + atomic_add_64(size, ab->b_size * ab->b_datacnt); + mutex_exit(&state->arcs_mtx); + } + return (cnt); +} + +/* + * Move the supplied buffer to the indicated state. The mutex + * for the buffer must be held by the caller. + */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +{ + arc_state_t *old_state = ab->b_state; + int64_t refcnt = refcount_count(&ab->b_refcnt); + uint64_t from_delta, to_delta; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(new_state != old_state); + ASSERT(refcnt == 0 || ab->b_datacnt > 0); + ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); + + from_delta = to_delta = ab->b_datacnt * ab->b_size; + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. + */ + if (refcnt == 0) { + if (old_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + + if (use_mutex) + mutex_enter(&old_state->arcs_mtx); + + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&old_state->arcs_list[ab->b_type], ab); + + /* + * If prefetching out of the ghost cache, + * we will have a non-zero datacnt. 
+ */ + if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + /* ghost elements have a ghost size */ + ASSERT(ab->b_buf == NULL); + from_delta = ab->b_size; + } + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); + + if (use_mutex) + mutex_exit(&old_state->arcs_mtx); + } + if (new_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + + if (use_mutex) + mutex_enter(&new_state->arcs_mtx); + + list_insert_head(&new_state->arcs_list[ab->b_type], ab); + + /* ghost elements have a ghost size */ + if (GHOST_STATE(new_state)) { + ASSERT(ab->b_datacnt == 0); + ASSERT(ab->b_buf == NULL); + to_delta = ab->b_size; + } + atomic_add_64(size, to_delta); + + if (use_mutex) + mutex_exit(&new_state->arcs_mtx); + } + } + + ASSERT(!BUF_EMPTY(ab)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) + buf_hash_remove(ab); + + /* adjust state sizes */ + if (to_delta) + atomic_add_64(&new_state->arcs_size, to_delta); + if (from_delta) { + ASSERT3U(old_state->arcs_size, >=, from_delta); + atomic_add_64(&old_state->arcs_size, -from_delta); + } + ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); +} + +void +arc_space_consume(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, space); + break; + } + + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, -space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, -space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, -space); + break; + } + + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); +} + +arc_buf_t * +arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + + ASSERT3U(size, >, 0); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_type = type; + hdr->b_spa = spa_guid(spa); + hdr->b_state = arc_anon; + hdr->b_arc_access = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + hdr->b_datacnt = 1; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + (void) refcount_add(&hdr->b_refcnt, tag); + + 
return (buf); +} + +static char *arc_onloan_tag = "onloan"; + +/* + * Loan out an anonymous arc buffer. Loaned buffers are not counted as in + * flight data by arc_tempreserve_space() until they are "returned". Loaned + * buffers must be returned to the arc before they can be used by the DMU or + * freed. + */ +arc_buf_t * +arc_loan_buf(spa_t *spa, int size) +{ + arc_buf_t *buf; + + buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + + atomic_add_64(&arc_loaned_bytes, size); + return (buf); +} + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(buf->b_data != NULL); + (void) refcount_add(&hdr->b_refcnt, tag); + (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + + atomic_add_64(&arc_loaned_bytes, -hdr->b_size); +} + +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + + ASSERT(buf->b_data != NULL); + hdr = buf->b_hdr; + (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_refcnt, tag); + buf->b_efunc = NULL; + buf->b_private = NULL; + + atomic_add_64(&arc_loaned_bytes, hdr->b_size); +} + +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = hdr->b_size; + + ASSERT(hdr->b_state != arc_anon); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = hdr->b_buf; + hdr->b_buf = buf; + arc_get_data_buf(buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_datacnt += 1; + return (buf); +} + +void +arc_buf_add_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + + /* + * Check to see if this buffer is evicted. Callers + * must verify b_data != NULL to know if the add_ref + * was successful. + */ + mutex_enter(&buf->b_evict_lock); + if (buf->b_data == NULL) { + mutex_exit(&buf->b_evict_lock); + return; + } + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + mutex_exit(&buf->b_evict_lock); + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); +} + +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. 
+ */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + +static void +arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +{ + arc_buf_t **bufp; + + /* free up data associated with the buf */ + if (buf->b_data) { + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_cksum_verify(buf); + + if (!recycle) { + if (type == ARC_BUFC_METADATA) { + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); + arc_space_return(size, ARC_SPACE_DATA); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); + ARCSTAT_INCR(arcstat_data_size, -size); + atomic_add_64(&arc_size, -size); + } + } + if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(state != arc_anon); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); + } + ASSERT3U(state->arcs_size, >=, size); + atomic_add_64(&state->arcs_size, -size); + buf->b_data = NULL; + ASSERT(buf->b_hdr->b_datacnt > 0); + buf->b_hdr->b_datacnt -= 1; + } + + /* only remove the buf if requested */ + if (!all) + return; + + /* remove the buf from the hdr list */ + for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + continue; + *bufp = buf->b_next; + buf->b_next = NULL; + + ASSERT(buf->b_efunc == NULL); + + /* clean up the buf */ + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); +} + +static void +arc_hdr_destroy(arc_buf_hdr_t *hdr) +{ + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT3P(hdr->b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; + + if (l2hdr != NULL) { + boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
+ */ + if (!buflist_held) { + mutex_enter(&l2arc_buflist_mtx); + l2hdr = hdr->b_l2hdr; + } + + if (l2hdr != NULL) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + + if (!buflist_held) + mutex_exit(&l2arc_buflist_mtx); + } + + if (!BUF_EMPTY(hdr)) { + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + buf_discard_identity(hdr); + } + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; + + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + mutex_enter(&buf->b_evict_lock); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_buf, FALSE, FALSE); + hdr->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_buf, FALSE, TRUE); + } + } + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + if (hdr->b_thawed) { + kmem_free(hdr->b_thawed, 1); + hdr->b_thawed = NULL; + } + + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + ASSERT3P(hdr->b_acb, ==, NULL); + kmem_cache_free(hdr_cache, hdr); +} + +void +arc_buf_free(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + int hashed = hdr->b_state != arc_anon; + + ASSERT(buf->b_efunc == NULL); + ASSERT(buf->b_data != NULL); + + if (hashed) { + kmutex_t *hash_lock = HDR_LOCK(hdr); + + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) { + arc_buf_destroy(buf, FALSE, TRUE); + } else { + ASSERT(buf == hdr->b_buf); + ASSERT(buf->b_efunc == NULL); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + mutex_exit(hash_lock); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + int destroy_hdr; + /* + * We are in the middle of an async write. Don't destroy + * this buffer unless the write completes before we finish + * decrementing the reference count. 
+ */ + mutex_enter(&arc_eviction_mtx); + (void) remove_reference(hdr, NULL, tag); + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + if (remove_reference(hdr, NULL, tag) > 0) + arc_buf_destroy(buf, FALSE, TRUE); + else + arc_hdr_destroy(hdr); + } +} + +int +arc_buf_remove_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + int no_callback = (buf->b_efunc == NULL); + + if (hdr->b_state == arc_anon) { + ASSERT(hdr->b_datacnt == 1); + arc_buf_free(buf, tag); + return (no_callback); + } + + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT(hdr->b_state != arc_anon); + ASSERT(buf->b_data != NULL); + + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) { + if (no_callback) + arc_buf_destroy(buf, FALSE, TRUE); + } else if (no_callback) { + ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(buf->b_efunc == NULL); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + ASSERT(no_callback || hdr->b_datacnt > 1 || + refcount_is_zero(&hdr->b_refcnt)); + mutex_exit(hash_lock); + return (no_callback); +} + +int +arc_buf_size(arc_buf_t *buf) +{ + return (buf->b_hdr->b_size); +} + +/* + * Evict buffers from list until we've removed the specified number of + * bytes. Move the removed buffers to the appropriate evict state. + * If the recycle flag is set, then attempt to "recycle" a buffer: + * - look for a buffer to evict that is `bytes' long. + * - return the data block from this buffer rather than freeing it. + * This flag is used by callers that are trying to make space for a + * new buffer in a full arc cache. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. + */ +static void * +arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, + arc_buf_contents_t type) +{ + arc_state_t *evicted_state; + uint64_t bytes_evicted = 0, skipped = 0, missed = 0; + arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; + kmutex_t *hash_lock; + boolean_t have_lock; + void *stolen = NULL; + + ASSERT(state == arc_mru || state == arc_mfu); + + evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(ab) || + (spa && ab->b_spa != spa) || + (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && + ddi_get_lbolt() - ab->b_arc_access < + arc_min_prefetch_lifespan)) { + skipped++; + continue; + } + /* "lookahead" for better eviction candidate */ + if (recycle && ab->b_size != bytes && + ab_prev && ab_prev->b_size == bytes) + continue; + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (have_lock || mutex_tryenter(hash_lock)) { + ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + ASSERT(ab->b_datacnt > 0); + while (ab->b_buf) { + arc_buf_t *buf = ab->b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + missed += 1; + break; + } + if (buf->b_data) { + bytes_evicted += ab->b_size; + if (recycle && ab->b_type == type && + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { + stolen = buf->b_data; + recycle = FALSE; + } + } + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + arc_buf_destroy(buf, + buf->b_data == stolen, FALSE); + ab->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&arc_eviction_mtx); + mutex_exit(&buf->b_evict_lock); + } else { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, + buf->b_data == stolen, TRUE); + } + } + + if (ab->b_l2hdr) { + ARCSTAT_INCR(arcstat_evict_l2_cached, + ab->b_size); + } else { + if (l2arc_write_eligible(ab->b_spa, ab)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + ab->b_size); + } else { + ARCSTAT_INCR( + arcstat_evict_l2_ineligible, + ab->b_size); + } + } + + if (ab->b_datacnt == 0) { + arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + } + if (!have_lock) + mutex_exit(hash_lock); + if (bytes >= 0 && bytes_evicted >= bytes) + break; + } else { + missed += 1; + } + } + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&state->arcs_mtx); + + if (bytes_evicted < bytes) + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + + if (skipped) + ARCSTAT_INCR(arcstat_evict_skip, skipped); + + if (missed) + ARCSTAT_INCR(arcstat_mutex_miss, missed); + + /* + * We have just evicted some date into the ghost state, make + * sure we also adjust the ghost state size if necessary. + */ + if (arc_no_grow && + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { + int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + int64_t todelete = + MIN(arc_mru_ghost->arcs_lsize[type], mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { + int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } + + return (stolen); +} + +/* + * Remove buffers from list until we've removed the specified number of + * bytes. Destroy the buffers that are removed. 
+ */ +static void +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) +{ + arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t marker = { 0 }; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; + kmutex_t *hash_lock; + uint64_t bytes_deleted = 0; + uint64_t bufs_skipped = 0; + + ASSERT(GHOST_STATE(state)); +top: + mutex_enter(&state->arcs_mtx); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + if (spa && ab->b_spa != spa) + continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + + hash_lock = HDR_LOCK(ab); + /* caller may be trying to modify this buffer, skip it */ + if (MUTEX_HELD(hash_lock)) + continue; + if (mutex_tryenter(hash_lock)) { + ASSERT(!HDR_IO_IN_PROGRESS(ab)); + ASSERT(ab->b_buf == NULL); + ARCSTAT_BUMP(arcstat_deleted); + bytes_deleted += ab->b_size; + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + if (bytes >= 0 && bytes_deleted >= bytes) + break; + } else if (bytes < 0) { + /* + * Insert a list marker and then wait for the + * hash lock to become available. Once its + * available, restart from where we left off. + */ + list_insert_after(list, ab, &marker); + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + mutex_enter(&state->arcs_mtx); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + } else + bufs_skipped += 1; + } + mutex_exit(&state->arcs_mtx); + + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + + if (bufs_skipped) { + ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); + ASSERT(bytes >= 0); + } + + if (bytes_deleted < bytes) + dprintf("only deleted %lld bytes from %p", + (longlong_t)bytes_deleted, state); +} + +static void +arc_adjust(void) +{ + int64_t adjustment, delta; + + /* + * Adjust MRU size + */ + + adjustment = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); + + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; + } + + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, + ARC_BUFC_METADATA); + } + + /* + * Adjust MFU size + */ + + adjustment = arc_size - arc_c; + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; + } + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t delta = MIN(adjustment, + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, + ARC_BUFC_METADATA); + } + + /* + * Adjust ghost lists + */ + + adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { + delta = MIN(arc_mru_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mru_ghost, NULL, delta); + } + + adjustment = + 
arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { + delta = MIN(arc_mfu_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mfu_ghost, NULL, delta); + } +} + +static void +arc_do_user_evicts(void) +{ + mutex_enter(&arc_eviction_mtx); + while (arc_eviction_list != NULL) { + arc_buf_t *buf = arc_eviction_list; + arc_eviction_list = buf->b_next; + mutex_enter(&buf->b_evict_lock); + buf->b_hdr = NULL; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_eviction_mtx); + + if (buf->b_efunc != NULL) + VERIFY(buf->b_efunc(buf) == 0); + + buf->b_efunc = NULL; + buf->b_private = NULL; + kmem_cache_free(buf_cache, buf); + mutex_enter(&arc_eviction_mtx); + } + mutex_exit(&arc_eviction_mtx); +} + +/* + * Flush all *evictable* data from the cache for the given spa. + * NOTE: this will not touch "active" (i.e. referenced) data. + */ +void +arc_flush(spa_t *spa) +{ + uint64_t guid = 0; + + if (spa) + guid = spa_guid(spa); + + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); + + mutex_enter(&arc_reclaim_thr_lock); + arc_do_user_evicts(); + mutex_exit(&arc_reclaim_thr_lock); + ASSERT(spa || arc_eviction_list == NULL); +} + +void +arc_shrink(void) +{ + if (arc_c > arc_c_min) { + uint64_t to_free; + +#ifdef _KERNEL + to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); +#else + to_free = arc_c >> arc_shrink_shift; +#endif + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (arc_c > arc_size) + arc_c = MAX(arc_size, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + if (arc_size > arc_c) + arc_adjust(); +} + +static int +arc_reclaim_needed(void) +{ + uint64_t extra; + +#ifdef _KERNEL + + if (needfree) + return (1); + + /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. 
+ */ + if (availrmem < swapfs_minfree + swapfs_reserve + extra) + return (1); + +#if defined(__i386) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the calculation, if less than 1/4th is + * free) + */ + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) + return (1); +#endif + +#else + if (spa_get_random(100) == 0) + return (1); +#endif + return (0); +} + +static void +arc_kmem_reap_now(arc_reclaim_strategy_t strat) +{ + size_t i; + kmem_cache_t *prev_cache = NULL; + kmem_cache_t *prev_data_cache = NULL; + extern kmem_cache_t *zio_buf_cache[]; + extern kmem_cache_t *zio_data_buf_cache[]; + +#ifdef _KERNEL + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } +#if defined(__i386) + /* + * Reclaim unused memory from all kmem caches. + */ + kmem_reap(); +#endif +#endif + + /* + * An aggressive reclamation will shrink the cache size as well as + * reap free buffers from the arc kmem caches. + */ + if (strat == ARC_RECLAIM_AGGR) + arc_shrink(); + + for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { + if (zio_buf_cache[i] != prev_cache) { + prev_cache = zio_buf_cache[i]; + kmem_cache_reap_now(zio_buf_cache[i]); + } + if (zio_data_buf_cache[i] != prev_data_cache) { + prev_data_cache = zio_data_buf_cache[i]; + kmem_cache_reap_now(zio_data_buf_cache[i]); + } + } + kmem_cache_reap_now(buf_cache); + kmem_cache_reap_now(hdr_cache); +} + +static void +arc_reclaim_thread(void) +{ + clock_t growtime = 0; + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_thr_lock); + while (arc_thread_exit == 0) { + if (arc_reclaim_needed()) { + + if (arc_no_grow) { + if (last_reclaim == ARC_RECLAIM_CONS) { + last_reclaim = ARC_RECLAIM_AGGR; + } else { + last_reclaim = ARC_RECLAIM_CONS; + } + } else { + arc_no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + membar_producer(); + } + + /* reset the growth delay for every reclaim */ + growtime = ddi_get_lbolt() + (arc_grow_retry * hz); + + arc_kmem_reap_now(last_reclaim); + arc_warm = B_TRUE; + + } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { + arc_no_grow = FALSE; + } + + arc_adjust(); + + if (arc_eviction_list != NULL) + arc_do_user_evicts(); + + /* block until needed, or one second, whichever is shorter */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&arc_reclaim_thr_cv, + &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + } + + arc_thread_exit = 0; + cv_broadcast(&arc_reclaim_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + thread_exit(); +} + +/* + * Adapt arc info given the number of bytes we are trying to add and + * the state that we are comming from. This function is only called + * when we are adding new content to the cache. 
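+ *
+ * As a worked example (hypothetical sizes): on a hit in the MRU
+ * ghost list with bytes = 128K, arc_mru_ghost->arcs_size = 1G and
+ * arc_mfu_ghost->arcs_size = 4G, the multiplier below is 4, so
+ * arc_p grows by 512K (clamped to arc_c - arc_p_min). A hit in the
+ * MFU ghost list shrinks arc_p symmetrically (clamped to arc_p_min).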
+ */ +static void +arc_adapt(int bytes, arc_state_t *state) +{ + int mult; + uint64_t arc_p_min = (arc_c >> arc_p_min_shift); + + if (state == arc_l2c_only) + return; + + ASSERT(bytes > 0); + /* + * Adapt the target size of the MRU list: + * - if we just hit in the MRU ghost list, then increase + * the target size of the MRU list. + * - if we just hit in the MFU ghost list, then increase + * the target size of the MFU list by decreasing the + * target size of the MRU list. + */ + if (state == arc_mru_ghost) { + mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? + 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ + + arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); + } else if (state == arc_mfu_ghost) { + uint64_t delta; + + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? + 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = MIN(mult, 10); + + delta = MIN(bytes * mult, arc_p); + arc_p = MAX(arc_p_min, arc_p - delta); + } + ASSERT((int64_t)arc_p >= 0); + + if (arc_reclaim_needed()) { + cv_signal(&arc_reclaim_thr_cv); + return; + } + + if (arc_no_grow) + return; + + if (arc_c >= arc_c_max) + return; + + /* + * If we're within (2 * maxblocksize) bytes of the target + * cache size, increment the target cache size + */ + if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + atomic_add_64(&arc_c, (int64_t)bytes); + if (arc_c > arc_c_max) + arc_c = arc_c_max; + else if (state == arc_anon) + atomic_add_64(&arc_p, (int64_t)bytes); + if (arc_p > arc_c) + arc_p = arc_c; + } + ASSERT((int64_t)arc_p >= 0); +} + +/* + * Check if the cache has reached its limits and eviction is required + * prior to insert. + */ +static int +arc_evict_needed(arc_buf_contents_t type) +{ + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif + + if (arc_reclaim_needed()) + return (1); + + return (arc_size > arc_c); +} + +/* + * The buffer, supplied as the first argument, needs a data block. + * So, if we are at cache max, determine which cache should be victimized. + * We have the following cases: + * + * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> + * In this situation if we're out of space, but the resident size of the MFU is + * under the limit, victimize the MFU cache to satisfy this insertion request. + * + * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> + * Here, we've used up all of the available space for the MRU, so we need to + * evict from our own cache instead. Evict from the set of resident MRU + * entries. + * + * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> + * c minus p represents the MFU space in the cache, since p is the size of the + * cache that is dedicated to the MRU. In this situation there's still space on + * the MFU side, so the MRU side needs to be victimized. + * + * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> + * MFU's resident set is consuming more space than it has been allotted. In + * this situation, we must victimize our own cache, the MFU, for this insertion. 
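+ *
+ * Put more compactly (an informal restatement of the cases above):
+ * an MRU/anon insert steals from the MFU side only while
+ * arc_p > arc_anon->arcs_size + arc_mru->arcs_size, and an MFU
+ * insert steals from the MRU side only while
+ * (arc_c - arc_p) > arc_mfu->arcs_size; in both cases the victim
+ * list must also hold enough evictable data of the right type,
+ * otherwise each list evicts from itself.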
+ */ +static void +arc_get_data_buf(arc_buf_t *buf) +{ + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_adapt(size, state); + + /* + * We have not yet reached cache maximum size, + * just allocate a new buffer. + */ + if (!arc_evict_needed(type)) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_DATA); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); + atomic_add_64(&arc_size, size); + } + goto out; + } + + /* + * If we are prefetching from the mfu ghost list, this buffer + * will end up on the mru list; so steal space from there. + */ + if (state == arc_mfu_ghost) + state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + else if (state == arc_mru_ghost) + state = arc_mru; + + if (state == arc_mru || state == arc_anon) { + uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; + state = (arc_mfu->arcs_lsize[type] >= size && + arc_p > mru_used) ? arc_mfu : arc_mru; + } else { + /* MFU cases */ + uint64_t mfu_space = arc_c - arc_p; + state = (arc_mru->arcs_lsize[type] >= size && + mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + } + if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_DATA); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); + atomic_add_64(&arc_size, size); + } + ARCSTAT_BUMP(arcstat_recycle_miss); + } + ASSERT(buf->b_data != NULL); +out: + /* + * Update the state size. Note that ghost states have a + * "ghost size" and so don't need to be updated. + */ + if (!GHOST_STATE(buf->b_hdr->b_state)) { + arc_buf_hdr_t *hdr = buf->b_hdr; + + atomic_add_64(&hdr->b_state->arcs_size, size); + if (list_link_active(&hdr->b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + } + /* + * If we are growing the cache, and we are adding anonymous + * data, and we have outgrown arc_p, update arc_p + */ + if (arc_size < arc_c && hdr->b_state == arc_anon && + arc_anon->arcs_size + arc_mru->arcs_size > arc_p) + arc_p = MIN(arc_c, arc_p + size); + } +} + +/* + * This routine is called whenever a buffer is accessed. + * NOTE: the hash lock is dropped in this function. + */ +static void +arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +{ + clock_t now; + + ASSERT(MUTEX_HELD(hash_lock)); + + if (buf->b_state == arc_anon) { + /* + * This buffer is not in the cache, and does not + * appear in our "ghost" list. Add the new buffer + * to the MRU state. + */ + + ASSERT(buf->b_arc_access == 0); + buf->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + arc_change_state(arc_mru, buf, hash_lock); + + } else if (buf->b_state == arc_mru) { + now = ddi_get_lbolt(); + + /* + * If this buffer is here because of a prefetch, then either: + * - clear the flag if this is a "referencing" read + * (any subsequent access will bump this into the MFU state). + * or + * - move the buffer to the head of the list if this is + * another prefetch (to make it less likely to be evicted). 
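+ *
+ * (The two cases are distinguished by the reference count: a
+ * demand read holds a reference, so a non-zero refcount clears
+ * ARC_PREFETCH; a repeated prefetch holds none and only has its
+ * b_arc_access timestamp refreshed while it stays on the list.)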
+ */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + if (refcount_count(&buf->b_refcnt) == 0) { + ASSERT(list_link_active(&buf->b_arc_node)); + } else { + buf->b_flags &= ~ARC_PREFETCH; + ARCSTAT_BUMP(arcstat_mru_hits); + } + buf->b_arc_access = now; + return; + } + + /* + * This buffer has been "accessed" only once so far, + * but it is still in the cache. Move it to the MFU + * state. + */ + if (now > buf->b_arc_access + ARC_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. + */ + buf->b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } + ARCSTAT_BUMP(arcstat_mru_hits); + } else if (buf->b_state == arc_mru_ghost) { + arc_state_t *new_state; + /* + * This buffer has been "accessed" recently, but + * was evicted from the cache. Move it to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + new_state = arc_mru; + if (refcount_count(&buf->b_refcnt) > 0) + buf->b_flags &= ~ARC_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + } else { + new_state = arc_mfu; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + } + + buf->b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + } else if (buf->b_state == arc_mfu) { + /* + * This buffer has been accessed more than once and is + * still in the cache. Keep it in the MFU state. + * + * NOTE: an add_reference() that occurred when we did + * the arc_read() will have kicked this off the list. + * If it was a prefetch, we will explicitly move it to + * the head of the list now. + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + ASSERT(refcount_count(&buf->b_refcnt) == 0); + ASSERT(list_link_active(&buf->b_arc_node)); + } + ARCSTAT_BUMP(arcstat_mfu_hits); + buf->b_arc_access = ddi_get_lbolt(); + } else if (buf->b_state == arc_mfu_ghost) { + arc_state_t *new_state = arc_mfu; + /* + * This buffer has been accessed more than once but has + * been evicted from the cache. Move it back to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + /* + * This is a prefetch access... + * move this block back to the MRU state. + */ + ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + new_state = arc_mru; + } + + buf->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. 
+ */ + + buf->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } else { + ASSERT(!"invalid arc state"); + } +} + +/* a generic arc_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + if (zio == NULL || zio->io_error == 0) + bcopy(buf->b_data, arg, buf->b_hdr->b_size); + VERIFY(arc_buf_remove_ref(buf, arg) == 1); +} + +/* a generic arc_done_func_t */ +void +arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + if (zio && zio->io_error) { + VERIFY(arc_buf_remove_ref(buf, arg) == 1); + *bufp = NULL; + } else { + *bufp = buf; + ASSERT(buf->b_data); + } +} + +static void +arc_read_done(zio_t *zio) +{ + arc_buf_hdr_t *hdr, *found; + arc_buf_t *buf; + arc_buf_t *abuf; /* buffer we're assigning to callback */ + kmutex_t *hash_lock; + arc_callback_t *callback_list, *acb; + int freeable = FALSE; + + buf = zio->io_private; + hdr = buf->b_hdr; + + /* + * The hdr was inserted into hash-table and removed from lists + * prior to starting I/O. We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. The only possible + * reason for it not to be found is if we were freed during the + * read. + */ + found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, + &hash_lock); + + ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + + hdr->b_flags &= ~ARC_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) + hdr->b_flags &= ~ARC_L2CACHE; + + /* byteswap if necessary */ + callback_list = hdr->b_acb; + ASSERT(callback_list != NULL); + if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { + arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? + byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + func(buf->b_data, hdr->b_size); + } + + arc_cksum_compute(buf, B_FALSE); + + if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + + /* create copies of the data buffer for the callers */ + abuf = buf; + for (acb = callback_list; acb; acb = acb->acb_next) { + if (acb->acb_done) { + if (abuf == NULL) + abuf = arc_buf_clone(buf); + acb->acb_buf = abuf; + abuf = NULL; + } + } + hdr->b_acb = NULL; + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + ASSERT(!HDR_BUF_AVAILABLE(hdr)); + if (abuf == buf) { + ASSERT(buf->b_efunc == NULL); + ASSERT(hdr->b_datacnt == 1); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + + ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + + if (zio->io_error != 0) { + hdr->b_flags |= ARC_IO_ERROR; + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + /* + * Broadcast before we drop the hash_lock to avoid the possibility + * that the hdr (and hence the cv) might be freed before we get to + * the cv_broadcast(). 
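+ *
+ * (The waiters are ARC_WAIT readers in arc_read_nolock() sleeping
+ * in cv_wait() on b_cv for this in-progress I/O to finish.)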
+ */ + cv_broadcast(&hdr->b_cv); + + if (hash_lock) { + mutex_exit(hash_lock); + } else { + /* + * This block was freed while we waited for the read to + * complete. It has been removed from the hash table and + * moved to the anonymous state (so that it won't show up + * in the cache). + */ + ASSERT3P(hdr->b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + /* execute each callback and free its structure */ + while ((acb = callback_list) != NULL) { + if (acb->acb_done) + acb->acb_done(zio, acb->acb_buf, acb->acb_private); + + if (acb->acb_zio_dummy != NULL) { + acb->acb_zio_dummy->io_error = zio->io_error; + zio_nowait(acb->acb_zio_dummy); + } + + callback_list = acb->acb_next; + kmem_free(acb, sizeof (arc_callback_t)); + } + + if (freeable) + arc_hdr_destroy(hdr); +} + +/* + * "Read" the block block at the specified DVA (in bp) via the + * cache. If the block is found in the cache, invoke the provided + * callback immediately and return. Note that the `zio' parameter + * in the callback will be NULL in this case, since no IO was + * required. If the block is not in the cache pass the read request + * on to the spa with a substitute callback function, so that the + * requested block will be added to the cache. + * + * If a read request arrives for a block that has a read in-progress, + * either wait for the in-progress read to complete (and return the + * results); or, if this is a read with a "done" func, add a record + * to the read to invoke the "done" func when the read completes, + * and return; or just return. + * + * arc_read_done() will invoke all the requested "done" functions + * for readers of this block. + * + * Normal callers should use arc_read and pass the arc buffer and offset + * for the bp. But if you know you don't need locking, you can use + * arc_read_bp. + */ +int +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + int err; + + if (pbuf == NULL) { + /* + * XXX This happens from traverse callback funcs, for + * the objset_phys_t block. 
+ */ + return (arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb)); + } + + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); + ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); + rw_enter(&pbuf->b_data_lock, RW_READER); + + err = arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb); + rw_exit(&pbuf->b_data_lock); + + return (err); +} + +int +arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + zio_t *rzio; + uint64_t guid = spa_guid(spa); + +top: + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); + if (hdr && hdr->b_datacnt > 0) { + + *arc_flags |= ARC_CACHED; + + if (HDR_IO_IN_PROGRESS(hdr)) { + + if (*arc_flags & ARC_WAIT) { + cv_wait(&hdr->b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + ASSERT(*arc_flags & ARC_NOWAIT); + + if (done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, NULL, zio_flags); + + ASSERT(acb->acb_done != NULL); + acb->acb_next = hdr->b_acb; + hdr->b_acb = acb; + add_reference(hdr, hash_lock, private); + mutex_exit(hash_lock); + return (0); + } + mutex_exit(hash_lock); + return (0); + } + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + if (done) { + add_reference(hdr, hash_lock, private); + /* + * If this block is already in use, create a new + * copy of the data so that we will be guaranteed + * that arc_release() will always succeed. 
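+ *
+ * ("In use" means ARC_BUF_AVAILABLE is clear because the existing
+ * buf has already been handed to another caller; arc_buf_clone()
+ * below then gives this reader its own copy backed by the same hdr.)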
+ */ + buf = hdr->b_buf; + ASSERT(buf); + ASSERT(buf->b_data); + if (HDR_BUF_AVAILABLE(hdr)) { + ASSERT(buf->b_efunc == NULL); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + } else { + buf = arc_buf_clone(buf); + } + + } else if (*arc_flags & ARC_PREFETCH && + refcount_count(&hdr->b_refcnt) == 0) { + hdr->b_flags |= ARC_PREFETCH; + } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); + + if (done) + done(NULL, buf, private); + } else { + uint64_t size = BP_GET_LSIZE(bp); + arc_callback_t *acb; + vdev_t *vd = NULL; + uint64_t addr; + boolean_t devw = B_FALSE; + + if (hdr == NULL) { + /* this block is not in the cache */ + arc_buf_hdr_t *exists; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + buf = arc_buf_alloc(spa, size, private, type); + hdr = buf->b_hdr; + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + buf_discard_identity(hdr); + (void) arc_buf_remove_ref(buf, private); + goto top; /* restart the IO request */ + } + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) { + (void) remove_reference(hdr, hash_lock, + private); + hdr->b_flags |= ARC_PREFETCH; + } + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + if (BP_GET_LEVEL(bp) > 0) + hdr->b_flags |= ARC_INDIRECT; + } else { + /* this block is in the ghost cache */ + ASSERT(GHOST_STATE(hdr->b_state)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); + ASSERT(hdr->b_buf == NULL); + + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) + hdr->b_flags |= ARC_PREFETCH; + else + add_reference(hdr, hash_lock, private); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + ASSERT(hdr->b_datacnt == 0); + hdr->b_datacnt = 1; + arc_get_data_buf(buf); + arc_access(hdr, hash_lock); + } + + ASSERT(!GHOST_STATE(hdr->b_state)); + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + + ASSERT(hdr->b_acb == NULL); + hdr->b_acb = acb; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + + if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && + (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr->b_dev->l2ad_writing; + addr = hdr->b_l2hdr->b_daddr; + /* + * Lock out device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + + mutex_exit(hash_lock); + + ASSERT3U(hdr->b_size, ==, size); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, + uint64_t, size, zbookmark_t *, zb); + ARCSTAT_BUMP(arcstat_misses); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, misses); + + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. 
This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. + * 5. This isn't prefetch and l2arc_noprefetch is set. + */ + if (hdr->b_l2hdr != NULL && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && + !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, zio_flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); + + if (*arc_flags & ARC_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + if (l2arc_ndev != 0) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } + } + + rzio = zio_read(pio, spa, bp, buf->b_data, size, + arc_read_done, buf, priority, zio_flags, zb); + + if (*arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(*arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + } + return (0); +} + +void +arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) +{ + ASSERT(buf->b_hdr != NULL); + ASSERT(buf->b_hdr->b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_efunc == NULL); + ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); + + buf->b_efunc = func; + buf->b_private = private; +} + +/* + * This is used by the DMU to let the ARC know that a buffer is + * being evicted, so the ARC should clean up. If this arc buf + * is not yet in the evicted state, it will be put there. + */ +int +arc_buf_evict(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + arc_buf_t **bufp; + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(). + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (0); + } else if (buf->b_data == NULL) { + arc_buf_t copy = *buf; /* structure assignment */ + /* + * We are on the eviction list; process this buffer now + * but let arc_do_user_evicts() do the reaping. 
+ */ + buf->b_efunc = NULL; + mutex_exit(&buf->b_evict_lock); + VERIFY(copy.b_efunc(&copy) == 0); + return (1); + } + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + /* + * Pull this buffer off of the hdr + */ + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = buf->b_next; + + ASSERT(buf->b_data != NULL); + arc_buf_destroy(buf, FALSE, FALSE); + + if (hdr->b_datacnt == 0) { + arc_state_t *old_state = hdr->b_state; + arc_state_t *evicted_state; + + ASSERT(hdr->b_buf == NULL); + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + + evicted_state = + (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&old_state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&old_state->arcs_mtx); + } + mutex_exit(hash_lock); + mutex_exit(&buf->b_evict_lock); + + VERIFY(buf->b_efunc(buf) == 0); + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_hdr = NULL; + buf->b_next = NULL; + kmem_cache_free(buf_cache, buf); + return (1); +} + +/* + * Release this buffer from the cache. This must be done + * after a read and prior to modifying the buffer contents. + * If the buffer has more than one reference, we must make + * a new hdr for the buffer. + */ +void +arc_release(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock = NULL; + l2arc_buf_hdr_t *l2hdr; + uint64_t buf_size; + + /* + * It would be nice to assert that if it's DMU metadata (level > + * 0 || it's the dnode file), then it must be syncing context. + * But we don't know that information at this level. + */ + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_refcnt) > 0); + + if (hdr->b_state == arc_anon) { + /* this buffer is already released */ + ASSERT(buf->b_efunc == NULL); + } else { + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + } + + l2hdr = hdr->b_l2hdr; + if (l2hdr) { + mutex_enter(&l2arc_buflist_mtx); + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } + + /* + * Do we have more than one buf? + */ + if (hdr->b_datacnt > 1) { + arc_buf_hdr_t *nhdr; + arc_buf_t **bufp; + uint64_t blksz = hdr->b_size; + uint64_t spa = hdr->b_spa; + arc_buf_contents_t type = hdr->b_type; + uint32_t flags = hdr->b_flags; + + ASSERT(hdr->b_buf != buf || buf->b_next != NULL); + /* + * Pull the data off of this hdr and attach it to + * a new anonymous hdr.
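+ *
+ * (The other readers keep sharing the original hdr; only this
+ * caller's buf is moved to a freshly allocated anonymous hdr with
+ * a single reference, so it can be dirtied without affecting them.)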
+ */ + (void) remove_reference(hdr, hash_lock, tag); + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = buf->b_next; + buf->b_next = NULL; + + ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_refcnt)) { + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); + } + hdr->b_datacnt -= 1; + arc_cksum_verify(buf); + + mutex_exit(hash_lock); + + nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + nhdr->b_size = blksz; + nhdr->b_spa = spa; + nhdr->b_type = type; + nhdr->b_buf = buf; + nhdr->b_state = arc_anon; + nhdr->b_arc_access = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; + nhdr->b_datacnt = 1; + nhdr->b_freeze_cksum = NULL; + (void) refcount_add(&nhdr->b_refcnt, tag); + buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); + atomic_add_64(&arc_anon->arcs_size, blksz); + } else { + mutex_exit(&buf->b_evict_lock); + ASSERT(refcount_count(&hdr->b_refcnt) == 1); + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_arc_access = 0; + if (hash_lock) + mutex_exit(hash_lock); + + buf_discard_identity(hdr); + arc_buf_thaw(buf); + } + buf->b_efunc = NULL; + buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + mutex_exit(&l2arc_buflist_mtx); + } +} + +/* + * Release this buffer. If it does not match the provided BP, fill it + * with that block's contents. + */ +/* ARGSUSED */ +int +arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb) +{ + arc_release(buf, tag); + return (0); +} + +int +arc_released(arc_buf_t *buf) +{ + int released; + + mutex_enter(&buf->b_evict_lock); + released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + mutex_exit(&buf->b_evict_lock); + return (released); +} + +int +arc_has_callback(arc_buf_t *buf) +{ + int callback; + + mutex_enter(&buf->b_evict_lock); + callback = (buf->b_efunc != NULL); + mutex_exit(&buf->b_evict_lock); + return (callback); +} + +#ifdef ZFS_DEBUG +int +arc_referenced(arc_buf_t *buf) +{ + int referenced; + + mutex_enter(&buf->b_evict_lock); + referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + mutex_exit(&buf->b_evict_lock); + return (referenced); +} +#endif + +static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. + * It is the responsibility of the callback to handle the + * accounting for any re-write attempt. 
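+ *
+ * (ARC_IO_IN_PROGRESS already being set means this ready callback
+ * has run before for the same buffer; the checksum frozen by the
+ * earlier attempt is stale and is discarded before a fresh one is
+ * computed below.)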
+ */ + if (HDR_IO_IN_PROGRESS(hdr)) { + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); + } + arc_cksum_compute(buf, B_FALSE); + hdr->b_flags |= ARC_IO_IN_PROGRESS; +} + +static void +arc_write_done(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(hdr->b_acb == NULL); + + if (zio->io_error == 0) { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } else { + ASSERT(BUF_EMPTY(hdr)); + } + + /* + * If the block to be written was all-zero, we may have + * compressed it away. In this case no write was performed + * so there will be no dva/birth/checksum. The buffer must + * therefore remain anonymous (and uncached). + */ + if (!BUF_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + ASSERT(zio->io_error == 0); + + arc_cksum_verify(buf); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). + */ + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else { + /* Dedup */ + ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } + } + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + /* if it's not anon, we are doing a scrub */ + if (!exists && hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + } + + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); + + kmem_free(callback, sizeof (arc_write_callback_t)); +} + +zio_t * +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_write_callback_t *callback; + zio_t *zio; + + ASSERT(ready != NULL); + ASSERT(done != NULL); + ASSERT(!HDR_IO_ERROR(hdr)); + ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT(hdr->b_acb == NULL); + if (l2arc) + hdr->b_flags |= ARC_L2CACHE; + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; + + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); + + return (zio); +} + +static int +arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) +{ +#ifdef _KERNEL + uint64_t available_memory = ptob(freemem); + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif + if (available_memory >= zfs_write_limit_max) 
+ return (0); + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. + */ + if (curproc == proc_pageout) { + if (page_load > MAX(ptob(minfree), available_memory) / 4) + return (ERESTART); + /* Note: reserve is inflated, so we deflate */ + page_load += reserve / 8; + return (0); + } else if (page_load > 0 && arc_reclaim_needed()) { + /* memory is low, delay before restarting */ + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (EAGAIN); + } + page_load = 0; + + if (arc_size > arc_c_min) { + uint64_t evictable_memory = + arc_mru->arcs_lsize[ARC_BUFC_DATA] + + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + available_memory += MIN(evictable_memory, arc_size - arc_c_min); + } + + if (inflight_data > available_memory / 4) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (ERESTART); + } +#endif + return (0); +} + +void +arc_tempreserve_clear(uint64_t reserve) +{ + atomic_add_64(&arc_tempreserve, -reserve); + ASSERT((int64_t)arc_tempreserve >= 0); +} + +int +arc_tempreserve_space(uint64_t reserve, uint64_t txg) +{ + int error; + uint64_t anon_size; + +#ifdef ZFS_DEBUG + /* + * Once in a while, fail for no reason. Everything should cope. + */ + if (spa_get_random(10000) == 0) { + dprintf("forcing random failure\n"); + return (ERESTART); + } +#endif + if (reserve > arc_c/4 && !arc_no_grow) + arc_c = MIN(arc_c_max, reserve * 4); + if (reserve > arc_c) + return (ENOMEM); + + /* + * Don't count loaned bufs as in flight dirty data to prevent long + * network delays from blocking transactions that are ready to be + * assigned to a txg. + */ + anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + + /* + * Writes will, almost always, require additional memory allocations + * in order to compress/encrypt/etc the data. We therefor need to + * make sure that there is sufficient available memory for this. + */ + if (error = arc_memory_throttle(reserve, anon_size, txg)) + return (error); + + /* + * Throttle writes when the amount of dirty data in the cache + * gets too large. We try to keep the cache less than half full + * of dirty blocks so that our sync times don't grow too large. + * Note: if two requests come in concurrently, we might let them + * both succeed, when one of them should fail. Not a huge deal. + */ + + if (reserve + arc_tempreserve + anon_size > arc_c / 2 && + anon_size > arc_c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, + arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, + arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, + reserve>>10, arc_c>>10); + return (ERESTART); + } + atomic_add_64(&arc_tempreserve, reserve); + return (0); +} + +void +arc_init(void) +{ + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + + /* Convert seconds to clock ticks */ + arc_min_prefetch_lifespan = 1 * hz; + + /* Start out with 1/8 of all memory */ + arc_c = physmem * PAGESIZE / 8; + +#ifdef _KERNEL + /* + * On architectures where the physical memory can be larger + * than the addressable space (intel in 32-bit mode), we may + * need to limit the cache to 1/8 of VM size. 
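+ *
+ * (Illustrative numbers only: a 32-bit kernel with 16GB of physical
+ * memory but a heap_arena of roughly 4GB would have arc_c capped
+ * near 512MB here instead of the 2GB suggested by physmem / 8.)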
+ */ + arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); +#endif + + /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ + arc_c_min = MAX(arc_c / 4, 64<<20); + /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ + if (arc_c * 8 >= 1<<30) + arc_c_max = (arc_c * 8) - (1<<30); + else + arc_c_max = arc_c_min; + arc_c_max = MAX(arc_c * 6, arc_c_max); + + /* + * Allow the tunables to override our calculations if they are + * reasonable (ie. over 64MB) + */ + if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) + arc_c_max = zfs_arc_max; + if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) + arc_c_min = zfs_arc_min; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + + if (zfs_arc_grow_retry > 0) + arc_grow_retry = zfs_arc_grow_retry; + + if (zfs_arc_shrink_shift > 0) + arc_shrink_shift = zfs_arc_shrink_shift; + + if (zfs_arc_p_min_shift > 0) + arc_p_min_shift = zfs_arc_p_min_shift; + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + arc_size = 0; + + mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + + buf_init(); + + arc_thread_exit = 0; + arc_eviction_list = NULL; + mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); + bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, + sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if 
(arc_ksp != NULL) { + arc_ksp->ks_data = &arc_stats; + kstat_install(arc_ksp); + } + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + + arc_dead = FALSE; + arc_warm = B_FALSE; + + if (zfs_write_limit_max == 0) + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + else + zfs_write_limit_shift = 0; + mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +arc_fini(void) +{ + mutex_enter(&arc_reclaim_thr_lock); + arc_thread_exit = 1; + while (arc_thread_exit != 0) + cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); + mutex_exit(&arc_reclaim_thr_lock); + + arc_flush(NULL); + + arc_dead = TRUE; + + if (arc_ksp != NULL) { + kstat_delete(arc_ksp); + arc_ksp = NULL; + } + + mutex_destroy(&arc_eviction_mtx); + mutex_destroy(&arc_reclaim_thr_lock); + cv_destroy(&arc_reclaim_thr_cv); + + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + + mutex_destroy(&arc_anon->arcs_mtx); + mutex_destroy(&arc_mru->arcs_mtx); + mutex_destroy(&arc_mru_ghost->arcs_mtx); + mutex_destroy(&arc_mfu->arcs_mtx); + mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&arc_l2c_only->arcs_mtx); + + mutex_destroy(&zfs_write_limit_lock); + + buf_fini(); + + ASSERT(arc_loaned_bytes == 0); +} + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. + * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. 
It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. 
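+ *
+ * (On OpenSolaris-derived systems these are global variables in the
+ * zfs module, so a hypothetical /etc/system entry such as
+ * "set zfs:l2arc_write_max = 0x2000000" would raise the per-interval
+ * write limit to 32MB; treat the value as illustrative only.)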
+ * + * There are three key functions that control how the L2ARC warms up: + * + * l2arc_write_eligible() check if a buffer is eligible to cache + * l2arc_write_size() calculate how much to write + * l2arc_write_interval() calculate sleep delay between writes + * + * These three functions determine what to write, how much, and how quickly + * to send writes. + */ + +static boolean_t +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +{ + /* + * A buffer is *not* eligible for the L2ARC if it: + * 1. belongs to a different spa. + * 2. is already cached on the L2ARC. + * 3. has an I/O in progress (it may be an incomplete read). + * 4. is flagged not eligible (zfs property). + */ + if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + return (B_FALSE); + + return (B_TRUE); +} + +static uint64_t +l2arc_write_size(l2arc_dev_t *dev) +{ + uint64_t size; + + size = dev->l2ad_write; + + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next, now; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); + + return (next); +} + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. 
+ */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *ab, *ab_prev; + l2arc_buf_hdr_t *abl2; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. + */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, ab); + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + + /* + * Allow ARC to begin reads to this L2ARC entry. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + kmem_cache_free(hdr_cache, head); + mutex_exit(&l2arc_buflist_mtx); + + l2arc_do_free_on_write(); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + int equal; + + ASSERT(zio->io_vd != NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + cb = zio->io_private; + ASSERT(cb != NULL); + buf = cb->l2rcb_buf; + ASSERT(buf != NULL); + + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + /* + * Check this survived the L2ARC journey. + */ + equal = arc_cksum_equal(buf); + if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = buf; + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. 
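+ *
+ * (If the device read itself failed, arcstat_l2_io_error is bumped;
+ * otherwise the zio is failed with EIO. A checksum mismatch also
+ * bumps arcstat_l2_cksum_bad. Either way, the retry below goes back
+ * to the original block pointer on the main pool.)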
+ */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = EIO; + } + if (!equal) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. + */ + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, + buf->b_data, zio->io_size, arc_read_done, buf, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static list_t * +l2arc_list_locked(int list_num, kmutex_t **lock) +{ + list_t *list; + + ASSERT(list_num >= 0 && list_num <= 3); + + switch (list_num) { + case 0: + list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 1: + list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mru->arcs_mtx; + break; + case 2: + list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 3: + list = &arc_mru->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mru->arcs_mtx; + break; + } + + ASSERT(!(MUTEX_HELD(*lock))); + mutex_enter(*lock); + return (list); +} + +/* + * Evict buffers from the device write hand to the distance specified in + * bytes. This distance may span populated buffers, it may span nothing. + * This is clearing a region on the L2ARC device ready for writing. + * If the 'all' boolean is set, every buffer is evicted. + */ +static void +l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) +{ + list_t *buflist; + l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t taddr; + + buflist = dev->l2ad_buflist; + + if (buflist == NULL) + return; + + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + return; + } + + if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { + /* + * When nearing the end of the device, evict to the end + * before the device write hand jumps to the start. + */ + taddr = dev->l2ad_end; + } else { + taddr = dev->l2ad_hand + distance; + } + DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, + uint64_t, taddr, boolean_t, all); + +top: + mutex_enter(&l2arc_buflist_mtx); + for (ab = list_tail(buflist); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. Retry. + */ + ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); + mutex_exit(&l2arc_buflist_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + if (HDR_L2_WRITE_HEAD(ab)) { + /* + * We hit a write head node. Leave it for + * l2arc_write_done(). + */ + list_remove(buflist, ab); + mutex_exit(hash_lock); + continue; + } + + if (!all && ab->b_l2hdr != NULL && + (ab->b_l2hdr->b_daddr > taddr || + ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + /* + * We've evicted to the target address, + * or the end of the device. 
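+ *
+ * Editor's worked example (illustrative numbers): with
+ * l2ad_hand = 100 MB, distance = 16 MB and l2ad_end = 1 GB, taddr is
+ * 116 MB; headers whose b_daddr falls between the hand and taddr are
+ * dropped below, and the first header behind the hand or beyond taddr
+ * ends the walk here.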
+ */ + mutex_exit(hash_lock); + break; + } + + if (HDR_FREE_IN_PROGRESS(ab)) { + /* + * Already on the path to destruction. + */ + mutex_exit(hash_lock); + continue; + } + + if (ab->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(ab)); + /* + * This doesn't exist in the ARC. Destroy. + * arc_hdr_destroy() will call list_remove() + * and decrement arcstat_l2_size. + */ + arc_change_state(arc_anon, ab, hash_lock); + arc_hdr_destroy(ab); + } else { + /* + * Invalidate issued or about to be issued + * reads, since we may be about to write + * over this location. + */ + if (HDR_L2_READING(ab)) { + ARCSTAT_BUMP(arcstat_l2_evict_reading); + ab->b_flags |= ARC_L2_EVICTED; + } + + /* + * Tell ARC this no longer exists in L2ARC. + */ + if (ab->b_l2hdr != NULL) { + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + list_remove(buflist, ab); + + /* + * This may have been leftover after a + * failed write. + */ + ab->b_flags &= ~ARC_L2_WRITING; + } + mutex_exit(hash_lock); + } + mutex_exit(&l2arc_buflist_mtx); + + vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); + dev->l2ad_evict = taddr; +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. + */ +static uint64_t +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +{ + arc_buf_hdr_t *ab, *ab_prev, *head; + l2arc_buf_hdr_t *hdrl2; + list_t *list; + uint64_t passed_sz, write_sz, buf_sz, headroom; + void *buf_data; + kmutex_t *hash_lock, *list_lock; + boolean_t have_lock, full; + l2arc_write_callback_t *cb; + zio_t *pio, *wzio; + uint64_t guid = spa_guid(spa); + + ASSERT(dev->l2ad_vdev != NULL); + + pio = NULL; + write_sz = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head->b_flags |= ARC_L2_WRITE_HEAD; + + /* + * Copy buffers for L2ARC writing. + */ + mutex_enter(&l2arc_buflist_mtx); + for (int try = 0; try <= 3; try++) { + list = l2arc_list_locked(try, &list_lock); + passed_sz = 0; + + /* + * L2ARC fast warmup. + * + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + headroom = target_sz * l2arc_headroom; + if (arc_warm == B_FALSE) + ab = list_head(list); + else + ab = list_tail(list); + + for (; ab; ab = ab_prev) { + if (arc_warm == B_FALSE) + ab_prev = list_next(list, ab); + else + ab_prev = list_prev(list, ab); + + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (!have_lock && !mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += ab->b_size; + if (passed_sz > headroom) { + /* + * Searched too far. + */ + mutex_exit(hash_lock); + break; + } + + if (!l2arc_write_eligible(guid, ab)) { + mutex_exit(hash_lock); + continue; + } + + if ((write_sz + ab->b_size) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(dev->l2ad_buflist, head); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + /* + * Create and add a new L2ARC header. 
+ */ + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2->b_dev = dev; + hdrl2->b_daddr = dev->l2ad_hand; + + ab->b_flags |= ARC_L2_WRITING; + ab->b_l2hdr = hdrl2; + list_insert_head(dev->l2ad_buflist, ab); + buf_data = ab->b_buf->b_data; + buf_sz = ab->b_size; + + /* + * Compute and store the buffer cksum before + * writing. On debug the cksum is verified first. + */ + arc_cksum_verify(ab->b_buf); + arc_cksum_compute(ab->b_buf, B_TRUE); + + mutex_exit(hash_lock); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, + NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + + /* + * Keep the clock hand suitably device-aligned. + */ + buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + write_sz += buf_sz; + dev->l2ad_hand += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + mutex_exit(&l2arc_buflist_mtx); + + if (pio == NULL) { + ASSERT3U(write_sz, ==, 0); + kmem_cache_free(hdr_cache, head); + return (0); + } + + ASSERT3U(write_sz, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); + ARCSTAT_INCR(arcstat_l2_size, write_sz); + vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. + */ + if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { + vdev_space_update(dev->l2ad_vdev, + dev->l2ad_end - dev->l2ad_hand, 0, 0); + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + dev->l2ad_writing = B_TRUE; + (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + return (write_sz); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +static void +l2arc_feed_thread(void) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size, wrote; + clock_t begin, next = ddi_get_lbolt(); + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + next); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = ddi_get_lbolt() + hz; + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + begin = ddi_get_lbolt(); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. + */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT(spa != NULL); + + /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* + * Avoid contributing to memory pressure. 
+ */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = l2arc_write_size(dev); + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. + */ + next = l2arc_write_interval(begin, size, wrote); + spa_config_exit(spa, SCL_L2ARC, dev); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev != NULL); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd) +{ + l2arc_dev_t *adddev; + + ASSERT(!l2arc_vdev_present(vd)); + + /* + * Create a new l2arc device entry. + */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_write = l2arc_write_max; + adddev->l2ad_boost = l2arc_write_boost; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; + ASSERT3U(adddev->l2ad_write, >, 0); + + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); + list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2node)); + + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT(remdev != NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. 
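+ *
+ * Editor's clarification: the L2ARC holds only clean copies of blocks
+ * that still exist in the main pool, so flushing the device amounts to
+ * dropping every header that points at it; no data has to be written
+ * back anywhere.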
+ */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(remdev->l2ad_buflist); + kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + kmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + /* + * This is called from dmu_fini(), which is called from spa_fini(); + * Because of this, we can assume that all l2arc devices have + * already been removed when the pools themselves were removed. + */ + + l2arc_do_free_on_write(); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_buflist_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode_global & FWRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode_global & FWRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} diff --git a/uts/common/fs/zfs/bplist.c b/uts/common/fs/zfs/bplist.c new file mode 100644 index 000000000000..066ccc6b1e05 --- /dev/null +++ b/uts/common/fs/zfs/bplist.c @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/bplist.h> +#include <sys/zfs_context.h> + + +void +bplist_create(bplist_t *bpl) +{ + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&bpl->bpl_list, sizeof (bplist_entry_t), + offsetof(bplist_entry_t, bpe_node)); +} + +void +bplist_destroy(bplist_t *bpl) +{ + list_destroy(&bpl->bpl_list); + mutex_destroy(&bpl->bpl_lock); +} + +void +bplist_append(bplist_t *bpl, const blkptr_t *bp) +{ + bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); + + mutex_enter(&bpl->bpl_lock); + bpe->bpe_blk = *bp; + list_insert_tail(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while (bpe = list_head(&bpl->bpl_list)) { + list_remove(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); + func(arg, &bpe->bpe_blk, tx); + kmem_free(bpe, sizeof (*bpe)); + mutex_enter(&bpl->bpl_lock); + } + mutex_exit(&bpl->bpl_lock); +} diff --git a/uts/common/fs/zfs/bpobj.c b/uts/common/fs/zfs/bpobj.c new file mode 100644 index 000000000000..72be31235607 --- /dev/null +++ b/uts/common/fs/zfs/bpobj.c @@ -0,0 +1,495 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/bpobj.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +uint64_t +bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + int size; + + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) + size = BPOBJ_SIZE_V0; + else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + size = BPOBJ_SIZE_V1; + else + size = sizeof (bpobj_phys_t); + + return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, + DMU_OT_BPOBJ_HDR, size, tx)); +} + +void +bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + int64_t i; + bpobj_t bpo; + dmu_object_info_t doi; + int epb; + dmu_buf_t *dbuf = NULL; + + VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); + + mutex_enter(&bpo.bpo_lock); + + if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) + goto out; + + VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + VERIFY3U(0, ==, dmu_buf_hold(os, + bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + bpobj_free(os, objarray[blkoff], tx); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); + +out: + mutex_exit(&bpo.bpo_lock); + bpobj_close(&bpo); + + VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); +} + +int +bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(os, object, &doi); + if (err) + return (err); + + bzero(bpo, sizeof (*bpo)); + mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); + + ASSERT(bpo->bpo_dbuf == NULL); + ASSERT(bpo->bpo_phys == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + + err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + + bpo->bpo_os = os; + bpo->bpo_object = object; + bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; + bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); + bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_phys = bpo->bpo_dbuf->db_data; + return (0); +} + +void +bpobj_close(bpobj_t *bpo) +{ + /* Lame workaround for closing a bpobj that was never opened. 
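+ * Editor's clarification: bpobj_open() only stores a nonzero
+ * bpo_object after it succeeds, and bpobj_close() resets it to zero
+ * below, so this early return also makes a double close harmless.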
*/ + if (bpo->bpo_object == 0) + return; + + dmu_buf_rele(bpo->bpo_dbuf, bpo); + if (bpo->bpo_cached_dbuf != NULL) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + bpo->bpo_dbuf = NULL; + bpo->bpo_phys = NULL; + bpo->bpo_cached_dbuf = NULL; + bpo->bpo_object = 0; + + mutex_destroy(&bpo->bpo_lock); +} + +static int +bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, + boolean_t free) +{ + dmu_object_info_t doi; + int epb; + int64_t i; + int err = 0; + dmu_buf_t *dbuf = NULL; + + mutex_enter(&bpo->bpo_lock); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + blkptr_t *bparray; + blkptr_t *bp; + uint64_t offset, blkoff; + + offset = i * sizeof (blkptr_t); + blkoff = P2PHASE(i, bpo->bpo_epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, + FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + bparray = dbuf->db_data; + bp = &bparray[blkoff]; + err = func(arg, bp, tx); + if (err) + break; + if (free) { + bpo->bpo_phys->bpo_bytes -= + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); + } + bpo->bpo_phys->bpo_num_blkptrs--; + ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + i++; + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, + i * sizeof (blkptr_t), -1ULL, tx)); + } + if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) + goto out; + + ASSERT(bpo->bpo_havecomp); + err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); + if (err) { + mutex_exit(&bpo->bpo_lock); + return (err); + } + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + bpobj_t sublist; + uint64_t used_before, comp_before, uncomp_before; + uint64_t used_after, comp_after, uncomp_after; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); + if (err) + break; + if (free) { + err = bpobj_space(&sublist, + &used_before, &comp_before, &uncomp_before); + if (err) + break; + } + err = bpobj_iterate_impl(&sublist, func, arg, tx, free); + if (free) { + VERIFY3U(0, ==, bpobj_space(&sublist, + &used_after, &comp_after, &uncomp_after)); + bpo->bpo_phys->bpo_bytes -= used_before - used_after; + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + bpo->bpo_phys->bpo_comp -= comp_before - comp_after; + bpo->bpo_phys->bpo_uncomp -= + uncomp_before - uncomp_after; + } + + bpobj_close(&sublist); + if (err) + break; + if (free) { + err = dmu_object_free(bpo->bpo_os, + objarray[blkoff], tx); + if (err) + break; + bpo->bpo_phys->bpo_num_subobjs--; + ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if 
(free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + (i + 1) * sizeof (uint64_t), -1ULL, tx)); + } + +out: + /* If there are no entries, there should be no bytes. */ + ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || + (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || + bpo->bpo_phys->bpo_bytes == 0); + + mutex_exit(&bpo->bpo_lock); + return (err); +} + +/* + * Iterate and remove the entries. If func returns nonzero, iteration + * will stop and that entry will not be removed. + */ +int +bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); +} + +/* + * Iterate the entries. If func returns nonzero, iteration will stop. + */ +int +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); +} + +void +bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) +{ + bpobj_t subbpo; + uint64_t used, comp, uncomp, subsubobjs; + + ASSERT(bpo->bpo_havesubobj); + ASSERT(bpo->bpo_havecomp); + + VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); + + if (used == 0) { + /* No point in having an empty subobj. */ + bpobj_close(&subbpo); + bpobj_free(bpo->bpo_os, subobj, tx); + return; + } + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + } + + mutex_enter(&bpo->bpo_lock); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + + /* + * If subobj has only one block of subobjs, then move subobj's + * subobjs to bpo's subobj list directly. This reduces + * recursion in bpobj_iterate due to nested subobjs. + */ + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + if (subsubobjs != 0) { + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset == doi.doi_data_block_size) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, + subsubobjs, tx)); + } + } + bpo->bpo_phys->bpo_bytes += used; + bpo->bpo_phys->bpo_comp += comp; + bpo->bpo_phys->bpo_uncomp += uncomp; + mutex_exit(&bpo->bpo_lock); + + bpobj_close(&subbpo); +} + +void +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t stored_bp = *bp; + uint64_t offset; + int blkoff; + blkptr_t *bparray; + + ASSERT(!BP_IS_HOLE(bp)); + + /* We never need the fill count. 
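+ * Editor's clarification (an assumption consistent with the
+ * surrounding comments): the bpobj only needs enough of each bp to
+ * account for and later free the block -- its DVAs, birth time and
+ * sizes -- so the fill count can be dropped here, just as the checksum
+ * is dropped below for non-dedup blocks.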
*/ + stored_bp.blk_fill = 0; + + /* The bpobj will compress better if we can leave off the checksum */ + if (!BP_GET_DEDUP(bp)) + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + + mutex_enter(&bpo->bpo_lock); + + offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); + blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); + + if (bpo->bpo_cached_dbuf == NULL || + offset < bpo->bpo_cached_dbuf->db_offset || + offset >= bpo->bpo_cached_dbuf->db_offset + + bpo->bpo_cached_dbuf->db_size) { + if (bpo->bpo_cached_dbuf) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, bpo, &bpo->bpo_cached_dbuf, 0)); + } + + dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); + bparray = bpo->bpo_cached_dbuf->db_data; + bparray[blkoff] = stored_bp; + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + bpo->bpo_phys->bpo_num_blkptrs++; + bpo->bpo_phys->bpo_bytes += + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpo->bpo_lock); +} + +struct space_range_arg { + spa_t *spa; + uint64_t mintxg; + uint64_t maxtxg; + uint64_t used; + uint64_t comp; + uint64_t uncomp; +}; + +/* ARGSUSED */ +static int +space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct space_range_arg *sra = arg; + + if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + sra->used += bp_get_dsize_sync(sra->spa, bp); + sra->comp += BP_GET_PSIZE(bp); + sra->uncomp += BP_GET_UCSIZE(bp); + } + return (0); +} + +int +bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + mutex_enter(&bpo->bpo_lock); + + *usedp = bpo->bpo_phys->bpo_bytes; + if (bpo->bpo_havecomp) { + *compp = bpo->bpo_phys->bpo_comp; + *uncompp = bpo->bpo_phys->bpo_uncomp; + mutex_exit(&bpo->bpo_lock); + return (0); + } else { + mutex_exit(&bpo->bpo_lock); + return (bpobj_space_range(bpo, 0, UINT64_MAX, + usedp, compp, uncompp)); + } +} + +/* + * Return the amount of space in the bpobj which is: + * mintxg < blk_birth <= maxtxg + */ +int +bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + struct space_range_arg sra = { 0 }; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpo_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) + return (bpobj_space(bpo, usedp, compp, uncompp)); + + sra.spa = dmu_objset_spa(bpo->bpo_os); + sra.mintxg = mintxg; + sra.maxtxg = maxtxg; + + err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); + *usedp = sra.used; + *compp = sra.comp; + *uncompp = sra.uncomp; + return (err); +} diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c new file mode 100644 index 000000000000..9c4e0296db2b --- /dev/null +++ b/uts/common/fs/zfs/dbuf.c @@ -0,0 +1,2707 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> + +static void dbuf_destroy(dmu_buf_impl_t *db); +static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); + +/* + * Global data structures and functions for the dbuf cache. + */ +static kmem_cache_t *dbuf_cache; + +/* ARGSUSED */ +static int +dbuf_cons(void *vdb, void *unused, int kmflag) +{ + dmu_buf_impl_t *db = vdb; + bzero(db, sizeof (dmu_buf_impl_t)); + + mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + refcount_create(&db->db_holds); + return (0); +} + +/* ARGSUSED */ +static void +dbuf_dest(void *vdb, void *unused) +{ + dmu_buf_impl_t *db = vdb; + mutex_destroy(&db->db_mtx); + cv_destroy(&db->db_changed); + refcount_destroy(&db->db_holds); +} + +/* + * dbuf hash table routines + */ +static dbuf_hash_table_t dbuf_hash_table; + +static uint64_t dbuf_hash_count; + +static uint64_t +dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) +{ + uintptr_t osv = (uintptr_t)os; + uint64_t crc = -1ULL; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; + + crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); + + return (crc); +} + +#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); + +#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ + ((dbuf)->db.db_object == (obj) && \ + (dbuf)->db_objset == (os) && \ + (dbuf)->db_level == (level) && \ + (dbuf)->db_blkid == (blkid)) + +dmu_buf_impl_t * +dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_t *os = dn->dn_objset; + uint64_t obj = dn->dn_object; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *db; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { + if (DBUF_EQUAL(db, os, obj, level, blkid)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (db); + } + mutex_exit(&db->db_mtx); + } + } + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. 
+ * Otherwise returns NULL. + */ +static dmu_buf_impl_t * +dbuf_hash_insert(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_t *os = db->db_objset; + uint64_t obj = db->db.db_object; + int level = db->db_level; + uint64_t blkid = db->db_blkid; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { + if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { + mutex_enter(&dbf->db_mtx); + if (dbf->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (dbf); + } + mutex_exit(&dbf->db_mtx); + } + } + + mutex_enter(&db->db_mtx); + db->db_hash_next = h->hash_table[idx]; + h->hash_table[idx] = db; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, 1); + + return (NULL); +} + +/* + * Remove an entry from the hash table. This operation will + * fail if there are any existing holds on the db. + */ +static void +dbuf_hash_remove(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf, **dbp; + + /* + * We musn't hold db_mtx to maintin lock ordering: + * DBUF_HASH_MUTEX > db_mtx. + */ + ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(db->db_state == DB_EVICTING); + ASSERT(!MUTEX_HELD(&db->db_mtx)); + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + dbp = &h->hash_table[idx]; + while ((dbf = *dbp) != db) { + dbp = &dbf->db_hash_next; + ASSERT(dbf != NULL); + } + *dbp = db->db_hash_next; + db->db_hash_next = NULL; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, -1); +} + +static arc_evict_func_t dbuf_do_evict; + +static void +dbuf_evict_user(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level != 0 || db->db_evict_func == NULL) + return; + + if (db->db_user_data_ptr_ptr) + *db->db_user_data_ptr_ptr = db->db.db_data; + db->db_evict_func(&db->db, db->db_user_ptr); + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; +} + +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) +{ + if (db->db_level > 0) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + +void +dbuf_evict(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL); + ASSERT(db->db_data_pending == NULL); + + dbuf_clear(db); + dbuf_destroy(db); +} + +void +dbuf_init(void) +{ + uint64_t hsize = 1ULL << 16; + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 4K block size. The table will take up + * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
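+ *
+ * Editor's worked example: with 8 GB of physical memory the loop below
+ * doubles hsize (starting at 2^16) until hsize * 4096 >= 8 GB,
+ * stopping at 2^21 buckets; at 8 bytes per pointer the table then
+ * occupies 2^21 * 8 = 16 MB, i.e. the 2 MB per GB quoted above.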
+ */ + while (hsize * 4096 < physmem * PAGESIZE) + hsize <<= 1; + +retry: + h->hash_table_mask = hsize - 1; + h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); + if (h->hash_table == NULL) { + /* XXX - we should really return an error instead of assert */ + ASSERT(hsize > (1ULL << 10)); + hsize >>= 1; + goto retry; + } + + dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + sizeof (dmu_buf_impl_t), + 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); +} + +void +dbuf_fini(void) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); + kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); + kmem_cache_destroy(dbuf_cache); +} + +/* + * Other stuff. + */ + +#ifdef ZFS_DEBUG +static void +dbuf_verify(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dbuf_dirty_record_t *dr; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn == NULL) { + ASSERT(db->db_parent == NULL); + ASSERT(db->db_blkptr == NULL); + } else { + ASSERT3U(db->db.db_object, ==, dn->dn_object); + ASSERT3P(db->db_objset, ==, dn->dn_objset); + ASSERT3U(db->db_level, <, dn->dn_nlevels); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !list_is_empty(&dn->dn_dbufs)); + } + if (db->db_blkid == DMU_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); + } else if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, 0); + } else { + ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); + } + + for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + /* + * We can't assert that db_size matches dn_datablksz because it + * can be momentarily different when another thread is doing + * dnode_set_blksz(). + */ + if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { + dr = db->db_data_pending; + /* + * It should only be modified in syncing context, so + * make sure we only have one copy of the data. + */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); + } + + /* verify db->db_blkptr */ + if (db->db_blkptr) { + if (db->db_parent == dn->dn_dbuf) { + /* db is pointed to by the dnode */ + /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) + ASSERT(db->db_parent == NULL); + else + ASSERT(db->db_parent != NULL); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } else { + /* db is pointed to by an indirect block */ + int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; + ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); + ASSERT3U(db->db_parent->db.db_object, ==, + db->db.db_object); + /* + * dnode_grow_indblksz() can make this fail if we don't + * have the struct_rwlock. XXX indblksz no longer + * grows. safe to do this now? 
+ */ + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); + } + } + } + if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + (db->db_buf == NULL || db->db_buf->b_data) && + db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && + db->db_state != DB_FILL && !dn->dn_free_txg) { + /* + * If the blkptr isn't set but they have nonzero data, + * it had better be dirty, otherwise we'll lose that + * data when we evict this buffer. + */ + if (db->db_dirtycnt == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } + } + DB_DNODE_EXIT(db); +} +#endif + +static void +dbuf_update_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_level == 0 && db->db_user_data_ptr_ptr) { + ASSERT(!refcount_is_zero(&db->db_holds)); + *db->db_user_data_ptr_ptr = db->db.db_data; + } +} + +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + db->db_buf = buf; + if (buf != NULL) { + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + dbuf_update_data(db); + } else { + dbuf_evict_user(db); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; + } +} + +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + spa_t *spa; + + mutex_exit(&db->db_mtx); + DB_GET_SPA(&spa, db); + abuf = arc_loan_buf(spa, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + dbuf_set_data(db, NULL); + mutex_exit(&db->db_mtx); + } + return (abuf); +} + +uint64_t +dbuf_whichblock(dnode_t *dn, uint64_t offset) +{ + if (dn->dn_datablkshift) { + return (offset >> dn->dn_datablkshift); + } else { + ASSERT3U(offset, <, dn->dn_datablksz); + return (0); + } +} + +static void +dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + + mutex_enter(&db->db_mtx); + ASSERT3U(db->db_state, ==, DB_READ); + /* + * All reads are synchronous, so we must have a hold on the dbuf + */ + ASSERT(refcount_count(&db->db_holds) > 0); + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + if (db->db_level == 0 && db->db_freed_in_flight) { + /* we were freed in flight; disregard any error */ + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + arc_buf_freeze(buf); + db->db_freed_in_flight = FALSE; + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else if (zio == NULL || zio->io_error == 0) { + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + db->db_state = DB_UNCACHED; + } + cv_broadcast(&db->db_changed); + dbuf_rele_and_unlock(db, NULL); +} + +static void +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) +{ + dnode_t *dn; + spa_t *spa; + zbookmark_t zb; + uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(!refcount_is_zero(&db->db_holds)); + /* We need the struct_rwlock to prevent db_blkptr from changing. 
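+ * (Editor's clarification: db_blkptr points into the parent's storage
+ * -- the dnode_phys_t or an indirect block's data, as dbuf_verify()
+ * checks above -- so it is only stable while dn_struct_rwlock keeps
+ * that parent from changing underneath us.)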
*/ + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_buf == NULL); + + if (db->db_blkid == DMU_BONUS_BLKID) { + int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); + + ASSERT3U(bonuslen, <=, db->db.db_size); + db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + if (bonuslen < DN_MAX_BONUSLEN) + bzero(db->db.db_data, DN_MAX_BONUSLEN); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + DB_DNODE_EXIT(db); + dbuf_update_data(db); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, + db->db.db_size, db, type)); + DB_DNODE_EXIT(db); + bzero(db->db.db_data, db->db.db_size); + db->db_state = DB_CACHED; + *flags |= DB_RF_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + spa = dn->dn_objset->os_spa; + DB_DNODE_EXIT(db); + + db->db_state = DB_READ; + mutex_exit(&db->db_mtx); + + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + + SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + dbuf_add_ref(db, NULL); + /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) dsl_read(zio, spa, db->db_blkptr, pbuf, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, + (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + &aflags, &zb); + if (aflags & ARC_CACHED) + *flags |= DB_RF_CACHED; +} + +int +dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +{ + int err = 0; + int havepzio = (zio != NULL); + int prefetch; + dnode_t *dn; + + /* + * We don't have to hold the mutex to check db_state because it + * can't be freed while we have a hold on the buffer. 
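+ *
+ * Editor's illustrative call, using flags this function already
+ * handles below (not a quote from any caller in the vendor tree):
+ *
+ *	err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ *	if (err != 0)
+ *		return (err);	(EIO when the block could not be read)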
+ */ + ASSERT(!refcount_is_zero(&db->db_holds)); + + if (db->db_state == DB_NOFILL) + return (EIO); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && + DBUF_IS_CACHEABLE(db); + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_CACHED) { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + } else if (db->db_state == DB_UNCACHED) { + spa_t *spa = dn->dn_objset->os_spa; + + if (zio == NULL) + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + dbuf_read_impl(db, zio, &flags); + + /* dbuf_read_impl has dropped db_mtx for us */ + + if (prefetch) + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, + db->db.db_size, flags & DB_RF_CACHED); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + if (!havepzio) + err = zio_wait(zio); + } else { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + mutex_enter(&db->db_mtx); + if ((flags & DB_RF_NEVERWAIT) == 0) { + while (db->db_state == DB_READ || + db->db_state == DB_FILL) { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + cv_wait(&db->db_changed, &db->db_mtx); + } + if (db->db_state == DB_UNCACHED) + err = EIO; + } + mutex_exit(&db->db_mtx); + } + + ASSERT(err || havepzio || db->db_state == DB_CACHED); + return (err); +} + +static void +dbuf_noread(dmu_buf_impl_t *db) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + DB_GET_SPA(&spa, db); + dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + db->db_state = DB_FILL; + } else if (db->db_state == DB_NOFILL) { + dbuf_set_data(db, NULL); + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + mutex_exit(&db->db_mtx); +} + +/* + * This is our just-in-time copy function. It makes a copy of + * buffers, that have been modified in a previous transaction + * group, before we modify them in the current active group. + * + * This function is used in two places: when we are dirtying a + * buffer for the first time in a txg, and when we are freeing + * a range in a dnode that includes this buffer. + * + * Note that when we are called from dbuf_free_range() we do + * not put a hold on the buffer, we just traverse the active + * dbuf list for the dnode. + */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) + return; + + /* + * If the last dirty record for this dbuf has not yet synced + * and its referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there a no active holders) + * just null out the current db_data pointer. + */ + ASSERT(dr->dr_txg >= txg - 2); + if (db->db_blkid == DMU_BONUS_BLKID) { + /* Note that the data bufs here are zio_bufs */ + dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + int size = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; + + DB_GET_SPA(&spa, db); + dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + } else { + dbuf_set_data(db, NULL); + } +} + +void +dbuf_unoverride(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + uint64_t txg = dr->dr_txg; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); + ASSERT(db->db_level == 0); + + if (db->db_blkid == DMU_BONUS_BLKID || + dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) + return; + + ASSERT(db->db_data_pending != dr); + + /* free this block */ + if (!BP_IS_HOLE(bp)) { + spa_t *spa; + + DB_GET_SPA(&spa, db); + zio_free(spa, txg, bp); + } + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + /* + * Release the already-written buffer, so we leave it in + * a consistent dirty state. Note that all callers are + * modifying the buffer, so they will immediately do + * another (redundant) arc_release(). Therefore, leave + * the buf thawed to save the effort of freezing & + * immediately re-thawing it. + */ + arc_release(dr->dt.dl.dr_data, db); +} + +/* + * Evict (if its unreferenced) or clear (if its referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen accross any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. 
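+ *
+ * Editor's worked example, assuming the common 16K indirect block size
+ * (dn_indblkshift = 14) and 128-byte block pointers
+ * (SPA_BLKPTRSHIFT = 7): epbs = 7, so each level-1 block maps 128
+ * level-0 blocks; freeing blocks 1000 through 5000 gives
+ * first_l1 = 1000 >> 7 = 7 and last_l1 = 5000 >> 7 = 39, so level-1
+ * dbufs 7 through 39 are the ones kept dirty by the code below.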
+ */ +void +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; + + if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + + if (db->db_level != 0) + continue; + dprintf_dbuf(db, "found buf %s\n", ""); + if (db->db_blkid < start || db->db_blkid > end) + continue; + + /* found a level 0 buffer in the range */ + if (dbuf_undirty(db, tx)) + continue; + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL || + db->db_state == DB_EVICTING) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + continue; + } + if (db->db_state == DB_READ || db->db_state == DB_FILL) { + /* will be handled in dbuf_read_done or dbuf_rele */ + db->db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + continue; + } + if (refcount_count(&db->db_holds) == 0) { + ASSERT(db->db_buf); + dbuf_clear(db); + continue; + } + /* The dbuf is referenced */ + + if (db->db_last_dirty != NULL) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + if (dr->dr_txg == txg) { + /* + * This buffer is "in-use", re-adjust the file + * size to reflect that this buffer may + * contain new data when we sync. + */ + if (db->db_blkid != DMU_SPILL_BLKID && + db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + dbuf_unoverride(dr); + } else { + /* + * This dbuf is not dirty in the open context. + * Either uncache it (if its not referenced in + * the open context) or reset its contents to + * empty. + */ + dbuf_fix_old_data(db, txg); + } + } + /* clear the contents if its cached */ + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + arc_release(db->db_buf, db); + bzero(db->db.db_data, db->db.db_size); + arc_buf_freeze(db->db_buf); + } + + mutex_exit(&db->db_mtx); + } + mutex_exit(&dn->dn_dbufs_mtx); +} + +static int +dbuf_block_freeable(dmu_buf_impl_t *db) +{ + dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; + uint64_t birth_txg = 0; + + /* + * We don't need any locking to protect db_blkptr: + * If it's syncing, then db_last_dirty will be set + * so we'll ignore db_blkptr. + */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_last_dirty) + birth_txg = db->db_last_dirty->dr_txg; + else if (db->db_blkptr) + birth_txg = db->db_blkptr->blk_birth; + + /* + * If we don't exist or are in a snapshot, we can't be freed. + * Don't pass the bp to dsl_dataset_block_freeable() since we + * are holding the db_mtx lock and might deadlock if we are + * prefetching a dedup-ed block. 
+ */ + if (birth_txg) + return (ds == NULL || + dsl_dataset_block_freeable(ds, NULL, birth_txg)); + else + return (FALSE); +} + +void +dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) +{ + arc_buf_t *buf, *obuf; + int osize = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dnode_t *dn; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + /* XXX does *this* func really need the lock? */ + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * This call to dbuf_will_dirty() with the dn_struct_rwlock held + * is OK, because there can be no other references to the db + * when we are changing its size, so no concurrent DB_FILL can + * be happening. + */ + /* + * XXX we should be doing a dbuf_read, checking the return + * value and returning that up to our callers + */ + dbuf_will_dirty(db, tx); + + /* create the data buffer for the new block */ + buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + + /* copy old block data to the new block */ + obuf = db->db_buf; + bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + /* zero the remainder */ + if (size > osize) + bzero((uint8_t *)buf->b_data + osize, size - osize); + + mutex_enter(&db->db_mtx); + dbuf_set_data(db, buf); + VERIFY(arc_buf_remove_ref(obuf, db) == 1); + db->db.db_size = size; + + if (db->db_level == 0) { + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + db->db_last_dirty->dt.dl.dr_data = buf; + } + mutex_exit(&db->db_mtx); + + dnode_willuse_space(dn, size-osize, tx); + DB_DNODE_EXIT(db); +} + +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os; + zbookmark_t zb; + + DB_GET_OBJSET(&os, db); + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + zb.zb_objset = os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + (void) arc_release_bp(db->db_buf, db, + db->db_blkptr, os->os_spa, &zb); +} + +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + objset_t *os; + dbuf_dirty_record_t **drp, *dr; + int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + DMU_TX_DIRTY_BUF(tx, db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* + * Shouldn't dirty a regular buffer in syncing context. Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); + /* + * We make this assert for private objects as well, but after we + * check if we're already dirty. They are allowed to re-dirty + * in syncing context. + */ + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + mutex_enter(&db->db_mtx); + /* + * XXX make this true for indirects too? The problem is that + * transactions created with dmu_tx_create_assigned() from + * syncing context don't bother holding ahead. 
+ */ + ASSERT(db->db_level != 0 || + db->db_state == DB_CACHED || db->db_state == DB_FILL || + db->db_state == DB_NOFILL); + + mutex_enter(&dn->dn_mtx); + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED && + !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + dn->dn_dirtyctx = + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); + ASSERT(dn->dn_dirtyctx_firstset == NULL); + dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&dn->dn_mtx); + + if (db->db_blkid == DMU_SPILL_BLKID) + dn->dn_have_spill = B_TRUE; + + /* + * If this buffer is already dirty, we're done. + */ + drp = &db->db_last_dirty; + ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { + DB_DNODE_EXIT(db); + + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(dr); + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) + arc_buf_thaw(db->db_buf); + } + mutex_exit(&db->db_mtx); + return (dr); + } + + /* + * Only valid if not already dirty. + */ + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. + */ + os = dn->dn_objset; + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + if (db->db_blkid != DMU_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dsize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + + /* + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. + */ + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + if (db->db_level == 0) { + void *data_old = db->db_buf; + + if (db->db_state != DB_NOFILL) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so + * that we can modify it without impacting + * possible other users of this cached data + * block. 
Note that indirect blocks and + * private objects are not released until the + * syncing state (since they are only modified + * then). + */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); + } + dr->dt.dl.dr_data = data_old; + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + dr->dr_next = *drp; + *drp = dr; + + /* + * We could have been freed_in_flight between the dbuf_noread + * and dbuf_dirty. We win, as though the dbuf_noread() had + * happened after the free. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, 3); + + mutex_exit(&db->db_mtx); + + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dsize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. 
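+ *
+ * [Editor's note -- illustrative example, not part of the original
+ * source.  Together with the dnode_willuse_space(dn, db->db.db_size, tx)
+ * call made above under db_mtx, the accounting for re-dirtying an
+ * existing block works out as, for example:
+ *
+ *	a 128K block whose current on-disk copy takes 16K after
+ *	compression is charged +131072 bytes (the new copy, estimated
+ *	at its logical size) and then -16384 bytes just below, the old
+ *	copy that bp_get_dsize() reports will be freed.
+ *
+ * The 16K figure is hypothetical; the point is only that willfree is
+ * the old copy's allocated size, not the logical block size, whenever
+ * a non-hole bp is present.]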
+ */ + ddt_prefetch(os->os_spa, bp); + dnode_willuse_space(dn, -willfree, tx); + } + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + + if (db->db_level+1 < dn->dn_nlevels) { + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + ASSERT(parent != NULL); + parent_held = TRUE; + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + ASSERT3U(db->db_level+1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + mutex_enter(&db->db_mtx); + /* possible race with dbuf_undirty() */ + if (db->db_last_dirty == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); + } else { + ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + } + + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); +} + +static int +dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + uint64_t txg = tx->tx_txg; + dbuf_dirty_record_t *dr, **drp; + + ASSERT(txg != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + mutex_enter(&db->db_mtx); + /* + * If this buffer is not dirty, we're done. + */ + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg <= txg) + break; + if (dr == NULL || dr->dr_txg < txg) { + mutex_exit(&db->db_mtx); + return (0); + } + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + /* + * If this buffer is currently held, we cannot undirty + * it, since one of the current holders may be in the + * middle of an update. Note that users of dbuf_undirty() + * should not place a hold on the dbuf before the call. 
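+ *
+ * [Editor's note -- illustrative, not part of the original source.
+ * Each dirty txg holds exactly one reference on the dbuf (dbuf_dirty()
+ * does dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg) as it bumps
+ * db_dirtycnt), so the check below,
+ *
+ *	refcount_count(&db->db_holds) > db->db_dirtycnt
+ *
+ * is true only when someone other than the dirty records holds the
+ * buffer.  E.g. a dbuf dirtied in txgs N and N+1 has db_dirtycnt == 2
+ * and two txg holds; a third hold must belong to an active user, so
+ * the undirty is skipped.]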
+ */ + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + /* Make sure we don't toss this buffer at sync phase */ + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + DB_DNODE_EXIT(db); + return (0); + } + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + ASSERT(db->db.db_size != 0); + + /* XXX would be nice to fix up dn_towrite_space[] */ + + *drp = dr->dr_next; + + if (dr->dr_parent) { + mutex_enter(&dr->dr_parent->dt.di.dr_mtx); + list_remove(&dr->dr_parent->dt.di.dr_children, dr); + mutex_exit(&dr->dr_parent->dt.di.dr_mtx); + } else if (db->db_level+1 == dn->dn_nlevels) { + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); + mutex_exit(&dn->dn_mtx); + } + DB_DNODE_EXIT(db); + + if (db->db_level == 0) { + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); + + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + } + } else { + ASSERT(db->db_buf != NULL); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + + if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { + arc_buf_t *buf = db->db_buf; + + ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + return (1); + } + + mutex_exit(&db->db_mtx); + return (0); +} + +#pragma weak dmu_buf_will_dirty = dbuf_will_dirty +void +dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); + (void) dbuf_read(db, NULL, rf); + (void) dbuf_dirty(db, tx); +} + +void +dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_state = DB_NOFILL; + + dmu_buf_will_fill(db_fake, tx); +} + +void +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(tx->tx_txg != 0); + ASSERT(db->db_level == 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || + dmu_tx_private_ok(tx)); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); +} + +#pragma weak dmu_buf_fill_done = dbuf_fill_done +/* ARGSUSED */ +void +dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + + if (db->db_state == DB_FILL) { + if (db->db_level == 0 && db->db_freed_in_flight) { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + /* we were freed while filling */ + /* XXX dbuf_undirty? */ + bzero(db->db.db_data, db->db.db_size); + db->db_freed_in_flight = FALSE; + } + db->db_state = DB_CACHED; + cv_broadcast(&db->db_changed); + } + mutex_exit(&db->db_mtx); +} + +/* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
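+ *
+ * [Editor's note -- illustrative, not part of the original source.
+ * "Referenced by anybody except our caller" is tested below as
+ *
+ *	refcount_count(&db->db_holds) - 1 > db->db_dirtycnt
+ *
+ * i.e. one hold is discounted for the caller and one per pending
+ * dirty txg; anything beyond that forces the bcopy path.  With only
+ * the caller's hold on a clean, cached dbuf (count == 1,
+ * db_dirtycnt == 0) the loaned arc buf is swapped in directly and no
+ * copy is made.]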
+ */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT(buf != NULL); + ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + xuio_stat_wbuf_copied(); + return; + } + + xuio_stat_wbuf_nocopy(); + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dbuf_fill_done(db, tx); +} + +/* + * "Clear" the contents of this dbuf. This will mark the dbuf + * EVICTING and clear *most* of its references. Unfortunetely, + * when we are not holding the dn_dbufs_mtx, we can't clear the + * entry in the dn_dbufs list. We have to wait until dbuf_destroy() + * in this case. For callers from the DMU we will usually see: + * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() + * For the arc callback, we will usually see: + * dbuf_do_evict()->dbuf_clear();dbuf_destroy() + * Sometimes, though, we will get a mix of these two: + * DMU: dbuf_clear()->arc_buf_evict() + * ARC: dbuf_do_evict()->dbuf_destroy() + */ +void +dbuf_clear(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dmu_buf_impl_t *parent = db->db_parent; + dmu_buf_impl_t *dndb; + int dbuf_gone = FALSE; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(refcount_is_zero(&db->db_holds)); + + dbuf_evict_user(db); + + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + if (db->db_blkid == DMU_BONUS_BLKID) { + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + } + db->db.db_data = NULL; + db->db_state = DB_UNCACHED; + } + + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); + ASSERT(db->db_data_pending == NULL); + + db->db_state = DB_EVICTING; + db->db_blkptr = NULL; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; + if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); + membar_producer(); + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. 
+ * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. + */ + dnode_rele(dn, db); + db->db_dnode_handle = NULL; + } else { + DB_DNODE_EXIT(db); + } + + if (db->db_buf) + dbuf_gone = arc_buf_evict(db->db_buf); + + if (!dbuf_gone) + mutex_exit(&db->db_mtx); + + /* + * If this dbuf is referenced from an indirect dbuf, + * decrement the ref count on the indirect dbuf. + */ + if (parent && parent != dndb) + dbuf_rele(parent, db); +} + +static int +dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, + dmu_buf_impl_t **parentp, blkptr_t **bpp) +{ + int nlevels, epbs; + + *parentp = NULL; + *bpp = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + + if (blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_have_spill && + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + *bpp = &dn->dn_phys->dn_spill; + else + *bpp = NULL; + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + mutex_exit(&dn->dn_mtx); + return (0); + } + + if (dn->dn_phys->dn_nlevels == 0) + nlevels = 1; + else + nlevels = dn->dn_phys->dn_nlevels; + + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT3U(level * epbs, <, 64); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (level >= nlevels || + (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { + /* the buffer has no parent yet */ + return (ENOENT); + } else if (level < nlevels-1) { + /* this block is referenced from an indirect block */ + int err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, NULL, parentp); + if (err) + return (err); + err = dbuf_read(*parentp, NULL, + (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err) { + dbuf_rele(*parentp, NULL); + *parentp = NULL; + return (err); + } + *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + (blkid & ((1ULL << epbs) - 1)); + return (0); + } else { + /* the block is referenced from the dnode */ + ASSERT3U(level, ==, nlevels-1); + ASSERT(dn->dn_phys->dn_nblkptr == 0 || + blkid < dn->dn_phys->dn_nblkptr); + if (dn->dn_dbuf) { + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + } + *bpp = &dn->dn_phys->dn_blkptr[blkid]; + return (0); + } +} + +static dmu_buf_impl_t * +dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, + dmu_buf_impl_t *parent, blkptr_t *blkptr) +{ + objset_t *os = dn->dn_objset; + dmu_buf_impl_t *db, *odb; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(dn->dn_type != DMU_OT_NONE); + + db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + + db->db_objset = os; + db->db.db_object = dn->dn_object; + db->db_level = level; + db->db_blkid = blkid; + db->db_last_dirty = NULL; + db->db_dirtycnt = 0; + db->db_dnode_handle = dn->dn_handle; + db->db_parent = parent; + db->db_blkptr = blkptr; + + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; + db->db_immediate_evict = 0; + db->db_freed_in_flight = 0; + + if (blkid == DMU_BONUS_BLKID) { + ASSERT3P(parent, ==, dn->dn_dbuf); + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + db->db.db_offset = DMU_BONUS_BLKID; + db->db_state = DB_UNCACHED; + /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + return (db); + } else if (blkid == DMU_SPILL_BLKID) { + db->db.db_size = (blkptr != NULL) ? + BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; + db->db.db_offset = 0; + } else { + int blocksize = + db->db_level ? 
1<<dn->dn_indblkshift : dn->dn_datablksz; + db->db.db_size = blocksize; + db->db.db_offset = db->db_blkid * blocksize; + } + + /* + * Hold the dn_dbufs_mtx while we get the new dbuf + * in the hash table *and* added to the dbufs list. + * This prevents a possible deadlock with someone + * trying to look up this dbuf before its added to the + * dn_dbufs list. + */ + mutex_enter(&dn->dn_dbufs_mtx); + db->db_state = DB_EVICTING; + if ((odb = dbuf_hash_insert(db)) != NULL) { + /* someone else inserted it first */ + kmem_cache_free(dbuf_cache, db); + mutex_exit(&dn->dn_dbufs_mtx); + return (odb); + } + list_insert_head(&dn->dn_dbufs, db); + db->db_state = DB_UNCACHED; + mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + + if (parent && parent != dn->dn_dbuf) + dbuf_add_ref(parent, db); + + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + refcount_count(&dn->dn_holds) > 0); + (void) refcount_add(&dn->dn_holds, db); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); + + dprintf_dbuf(db, "db=%p\n", db); + + return (db); +} + +static int +dbuf_do_evict(void *private) +{ + arc_buf_t *buf = private; + dmu_buf_impl_t *db = buf->b_private; + + if (!MUTEX_HELD(&db->db_mtx)) + mutex_enter(&db->db_mtx); + + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_state != DB_EVICTING) { + ASSERT(db->db_state == DB_CACHED); + DBUF_VERIFY(db); + db->db_buf = NULL; + dbuf_evict(db); + } else { + mutex_exit(&db->db_mtx); + dbuf_destroy(db); + } + return (0); +} + +static void +dbuf_destroy(dmu_buf_impl_t *db) +{ + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this dbuf is still on the dn_dbufs list, + * remove it from that list. + */ + if (db->db_dnode_handle != NULL) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + mutex_enter(&dn->dn_dbufs_mtx); + list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); + mutex_exit(&dn->dn_dbufs_mtx); + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold + * corresponding to the removed dbuf is no longer + * discounted in dnode_move(), so the dnode cannot be + * moved until after we release the hold. + */ + dnode_rele(dn, db); + db->db_dnode_handle = NULL; + } + dbuf_hash_remove(db); + } + db->db_parent = NULL; + db->db_buf = NULL; + + ASSERT(!list_link_active(&db->db_link)); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + + kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); +} + +void +dbuf_prefetch(dnode_t *dn, uint64_t blkid) +{ + dmu_buf_impl_t *db = NULL; + blkptr_t *bp = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (dnode_block_freed(dn, blkid)) + return; + + /* dbuf_find() returns with db_mtx held */ + if (db = dbuf_find(dn, 0, blkid)) { + /* + * This dbuf is already in the cache. We assume that + * it is already CACHED, or else about to be either + * read or filled. + */ + mutex_exit(&db->db_mtx); + return; + } + + if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { + if (bp && !BP_IS_HOLE(bp)) { + int priority = dn->dn_type == DMU_OT_DDT_ZAP ? + ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; + arc_buf_t *pbuf; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, + dn->dn_object, 0, blkid); + + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) dsl_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, priority, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + if (db) + dbuf_rele(db, NULL); + } +} + +/* + * Returns with db_holds incremented, and db_mtx not held. + * Note: dn_struct_rwlock must be held. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, + void *tag, dmu_buf_impl_t **dbp) +{ + dmu_buf_impl_t *db, *parent = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); + + *dbp = NULL; +top: + /* dbuf_find() returns with db_mtx held */ + db = dbuf_find(dn, level, blkid); + + if (db == NULL) { + blkptr_t *bp = NULL; + int err; + + ASSERT3P(parent, ==, NULL); + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = ENOENT; + if (err) { + if (parent) + dbuf_rele(parent, NULL); + return (err); + } + } + if (err && err != ENOENT) + return (err); + db = dbuf_create(dn, level, blkid, parent, bp); + } + + if (db->db_buf && refcount_is_zero(&db->db_holds)) { + arc_buf_add_ref(db->db_buf, db); + if (db->db_buf->b_data == NULL) { + dbuf_clear(db); + if (parent) { + dbuf_rele(parent, NULL); + parent = NULL; + } + goto top; + } + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + } + + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); + + /* + * If this buffer is currently syncing out, and we are are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + dn->dn_object != DMU_META_DNODE_OBJECT && + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + + if (dr->dt.dl.dr_data == db->db_buf) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, + arc_buf_alloc(dn->dn_objset->os_spa, + db->db.db_size, db, type)); + bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, + db->db.db_size); + } + } + + (void) refcount_add(&db->db_holds, tag); + dbuf_update_data(db); + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (parent) + dbuf_rele(parent, NULL); + + ASSERT3P(DB_DNODE(db), ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; + + return (0); +} + +dmu_buf_impl_t * +dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); + return (err ? NULL : db); +} + +dmu_buf_impl_t * +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + return (err ? 
NULL : db); +} + +void +dbuf_create_bonus(dnode_t *dn) +{ + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + ASSERT(dn->dn_bonus == NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); +} + +int +dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + if (db->db_blkid != DMU_SPILL_BLKID) + return (ENOTSUP); + if (blksz == 0) + blksz = SPA_MINBLOCKSIZE; + if (blksz > SPA_MAXBLOCKSIZE) + blksz = SPA_MAXBLOCKSIZE; + else + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dbuf_new_size(db, blksz, tx); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + return (0); +} + +void +dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); +} + +#pragma weak dmu_buf_add_ref = dbuf_add_ref +void +dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds = refcount_add(&db->db_holds, tag); + ASSERT(holds > 1); +} + +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) + * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ +#pragma weak dmu_buf_rele = dbuf_rele +void +dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, tag); +} + +/* + * dbuf_rele() for an already-locked dbuf. This is necessary to allow + * db_dirtycnt and db_holds to be updated atomically. + */ +void +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + DBUF_VERIFY(db); + + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ + holds = refcount_remove(&db->db_holds, tag); + ASSERT(holds >= 0); + + /* + * We can't freeze indirects if there is a possibility that they + * may be modified in the current syncing context. + */ + if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + arc_buf_freeze(db->db_buf); + + if (holds == db->db_dirtycnt && + db->db_level == 0 && db->db_immediate_evict) + dbuf_evict_user(db); + + if (holds == 0) { + if (db->db_blkid == DMU_BONUS_BLKID) { + mutex_exit(&db->db_mtx); + + /* + * If the dnode moves here, we cannot cross this barrier + * until the move completes. + */ + DB_DNODE_ENTER(db); + (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); + DB_DNODE_EXIT(db); + /* + * The bonus buffer's dnode hold is no longer discounted + * in dnode_move(). The dnode cannot move until after + * the dnode_rele(). + */ + dnode_rele(DB_DNODE(db), db); + } else if (db->db_buf == NULL) { + /* + * This is a special case: we never associated this + * dbuf with any data allocated from the ARC. + */ + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); + dbuf_evict(db); + } else if (arc_released(db->db_buf)) { + arc_buf_t *buf = db->db_buf; + /* + * This dbuf has anonymous data associated with it. 
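+ *
+ * [Editor's note -- not part of the original source.  "Anonymous"
+ * here means the arc buf was detached from the ARC's hash table by an
+ * earlier arc_release() (as dbuf_dirty() and dbuf_free_range() above
+ * do), so it can no longer be looked up or shared by block pointer;
+ * on the last release the only sensible thing to do is free it and
+ * evict the dbuf, as below.]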
+ */ + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + } else { + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); + } + } else { + mutex_exit(&db->db_mtx); + } +} + +#pragma weak dmu_buf_refcount = dbuf_refcount +uint64_t +dbuf_refcount(dmu_buf_impl_t *db) +{ + return (refcount_count(&db->db_holds)); +} + +void * +dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + ASSERT((user_ptr == NULL) == (evict_func == NULL)); + + mutex_enter(&db->db_mtx); + + if (db->db_user_ptr == old_user_ptr) { + db->db_user_ptr = user_ptr; + db->db_user_data_ptr_ptr = user_data_ptr_ptr; + db->db_evict_func = evict_func; + + dbuf_update_data(db); + } else { + old_user_ptr = db->db_user_ptr; + } + + mutex_exit(&db->db_mtx); + return (old_user_ptr); +} + +void * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); + + return (db->db_user_ptr); +} + +boolean_t +dmu_buf_freeable(dmu_buf_t *dbuf) +{ + boolean_t res = B_FALSE; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + + if (db->db_blkptr) + res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, + db->db_blkptr, db->db_blkptr->blk_birth); + + return (res); +} + +static void +dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) +{ + /* ASSERT(dmu_tx_is_syncing(tx) */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_blkptr != NULL) + return; + + if (db->db_blkid == DMU_SPILL_BLKID) { + db->db_blkptr = &dn->dn_phys->dn_spill; + BP_ZERO(db->db_blkptr); + return; + } + if (db->db_level == dn->dn_phys->dn_nlevels-1) { + /* + * This buffer was allocated at a time when there was + * no available blkptrs from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mis-match). 
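+ *
+ * [Editor's note -- worked example, not part of the original source.
+ * epbs below is dn_indblkshift - SPA_BLKPTRSHIFT, i.e. log2 of the
+ * number of 128-byte block pointers per indirect block.  With the
+ * usual 16K indirect blocks (indblkshift == 14) epbs == 7, so each
+ * indirect block maps 128 children: a level-0 block with blkid 1000
+ * hangs off the level-1 block with blkid 1000 >> 7 == 7, at slot
+ * 1000 & 127 == 104 -- exactly the shift and mask used below.]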
+ */ + ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + DBUF_VERIFY(db); + } else { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_hold_impl(dn, db->db_level+1, + db->db_blkid >> epbs, FALSE, db, &parent); + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + db->db_parent = parent; + } + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + DBUF_VERIFY(db); + } +} + +static void +dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + zio_t *zio; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + + ASSERT(db->db_level > 0); + DBUF_VERIFY(db); + + if (db->db_buf == NULL) { + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + mutex_enter(&db->db_mtx); + } + ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT(db->db_buf != NULL); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + dbuf_check_blkptr(dn, db); + DB_DNODE_EXIT(db); + + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + dbuf_write(dr, db->db_buf, tx); + + zio = dr->dr_zio; + mutex_enter(&dr->dt.di.dr_mtx); + dbuf_sync_list(&dr->dt.di.dr_children, tx); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_exit(&dr->dt.di.dr_mtx); + zio_nowait(zio); +} + +static void +dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + arc_buf_t **datap = &dr->dt.dl.dr_data; + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + uint64_t txg = tx->tx_txg; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + /* + * To be synced, we must be dirtied. But we + * might have been freed after the dirty. + */ + if (db->db_state == DB_UNCACHED) { + /* This buffer has been freed since it was dirtied */ + ASSERT(db->db.db_data == NULL); + } else if (db->db_state == DB_FILL) { + /* This buffer was freed and is now being re-filled */ + ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else { + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); + } + DBUF_VERIFY(db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + + /* + * If this is a bonus buffer, simply copy the bonus data into the + * dnode. It will be written out when the dnode is synced (and it + * will be synced, since it must have been dirty for dbuf_sync to + * be called). 
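+ *
+ * [Editor's note -- not part of the original source.  Per
+ * dbuf_create() above, a bonus dbuf's size is DN_MAX_BONUSLEN minus
+ * the space taken by any extra block pointers
+ * ((dn_nblkptr - 1) * sizeof (blkptr_t)), i.e. the tail of the
+ * 512-byte on-disk dnode (320 bytes in the common single-blkptr
+ * case).  Note that only dn_bonuslen bytes are copied into
+ * DN_BONUS(dn_phys) below, while the full DN_MAX_BONUSLEN buffer is
+ * what gets freed.]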
+ */ + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_dirty_record_t **drp; + + ASSERT(*datap != NULL); + ASSERT3U(db->db_level, ==, 0); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); + bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + + if (*datap != db->db.db_data) { + zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + } + db->db_data_pending = NULL; + drp = &db->db_last_dirty; + while (*drp != dr) + drp = &(*drp)->dr_next; + ASSERT(dr->dr_next == NULL); + ASSERT(dr->dr_dbuf == db); + *drp = dr->dr_next; + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + return; + } + + os = dn->dn_objset; + + /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* + * If this buffer is in the middle of an immediate write, + * wait for the synchronous IO to complete. + */ + while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + cv_wait(&db->db_changed, &db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); + } + + if (db->db_state != DB_NOFILL && + dn->dn_object != DMU_META_DNODE_OBJECT && + refcount_count(&db->db_holds) > 1 && + dr->dt.dl.dr_override_state != DR_OVERRIDDEN && + *datap == db->db_buf) { + /* + * If this buffer is currently "in use" (i.e., there + * are active holds and db_data still references it), + * then make a copy before we start the write so that + * any modifications from the open txg will not leak + * into this write. + * + * NOTE: this copy does not need to be made for + * objects only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ + int blksz = arc_buf_size(*datap); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); + } + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + + dbuf_write(dr, *datap, tx); + + ASSERT(!list_link_active(&dr->dr_dirty_node)); + if (dn->dn_object == DMU_META_DNODE_OBJECT) { + list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); + DB_DNODE_EXIT(db); + } else { + /* + * Although zio_nowait() does not "wait for an IO", it does + * initiate the IO. If this is an empty write it seems plausible + * that the IO could actually be completed before the nowait + * returns. We need to DB_DNODE_EXIT() first in case + * zio_nowait() invalidates the dbuf. + */ + DB_DNODE_EXIT(db); + zio_nowait(dr->dr_zio); + } +} + +void +dbuf_sync_list(list_t *list, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + + while (dr = list_head(list)) { + if (dr->dr_zio != NULL) { + /* + * If we find an already initialized zio then we + * are processing the meta-dnode, and we have finished. + * The dbufs for all dnodes are put back on the list + * during processing, so that we can zio_wait() + * these IOs after initiating all child IOs. 
+ */ + ASSERT3U(dr->dr_dbuf->db.db_object, ==, + DMU_META_DNODE_OBJECT); + break; + } + list_remove(list, dr); + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } +} + +/* ARGSUSED */ +static void +dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + spa_t *spa = zio->io_spa; + int64_t delta; + uint64_t fill = 0; + int i; + + ASSERT(db->db_blkptr == bp); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta - zio->io_prev_space_delta); + zio->io_prev_space_delta = delta; + + if (BP_IS_HOLE(bp)) { + ASSERT(bp->blk_fill == 0); + DB_DNODE_EXIT(db); + return; + } + + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype)); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + + mutex_enter(&db->db_mtx); + +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + } +#endif + + if (db->db_level == 0) { + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > dn->dn_phys->dn_maxblkid && + db->db_blkid != DMU_SPILL_BLKID) + dn->dn_phys->dn_maxblkid = db->db_blkid; + mutex_exit(&dn->dn_mtx); + + if (dn->dn_type == DMU_OT_DNODE) { + dnode_phys_t *dnp = db->db.db_data; + for (i = db->db.db_size >> DNODE_SHIFT; i > 0; + i--, dnp++) { + if (dnp->dn_type != DMU_OT_NONE) + fill++; + } + } else { + fill = 1; + } + } else { + blkptr_t *ibp = db->db.db_data; + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) + continue; + fill += ibp->blk_fill; + } + } + DB_DNODE_EXIT(db); + + bp->blk_fill = fill; + + mutex_exit(&db->db_mtx); +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint64_t txg = zio->io_txg; + dbuf_dirty_record_t **drp, *dr; + + ASSERT3U(zio->io_error, ==, 0); + ASSERT(db->db_blkptr == bp); + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + objset_t *os; + dsl_dataset_t *ds; + dmu_tx_t *tx; + + DB_GET_OBJSET(&os, db); + ds = os->os_dsl_dataset; + tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } + + mutex_enter(&db->db_mtx); + + DBUF_VERIFY(db); + + drp = &db->db_last_dirty; + while ((dr = *drp) != db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; + +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + DB_DNODE_EXIT(db); + } +#endif + + if (db->db_level == 0) { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + if (db->db_state != DB_NOFILL) { + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + else 
if (!arc_released(db->db_buf)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } + } else { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + if (!BP_IS_HOLE(db->db_blkptr)) { + int epbs = + dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + db->db.db_size); + ASSERT3U(dn->dn_phys->dn_maxblkid + >> (db->db_level * epbs), >=, db->db_blkid); + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } + DB_DNODE_EXIT(db); + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + db->db_data_pending = NULL; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); +} + +static void +dbuf_write_nofill_ready(zio_t *zio) +{ + dbuf_write_ready(zio, NULL, zio->io_private); +} + +static void +dbuf_write_nofill_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +static void +dbuf_write_override_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + + dbuf_write_ready(zio, NULL, db); +} + +static void +dbuf_write_override_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *obp = &dr->dt.dl.dr_overridden_by; + + mutex_enter(&db->db_mtx); + if (!BP_EQUAL(zio->io_bp, obp)) { + if (!BP_IS_HOLE(obp)) + dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); + arc_release(dr->dt.dl.dr_data, db); + } + mutex_exit(&db->db_mtx); + + dbuf_write_done(zio, NULL, db); +} + +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + zio_prop_t zp; + zio_t *zio; + int wp_flag = 0; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; + + if (db->db_state != DB_NOFILL) { + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) { + arc_buf_thaw(data); + } else { + dbuf_release_bp(db); + } + } + } + + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid != DMU_SPILL_BLKID) || + (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + if (db->db_blkid == DMU_SPILL_BLKID) + wp_flag = WP_SPILL; + wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; + + dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); + + if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + ASSERT(db->db_state != DB_NOFILL); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, data->b_data, arc_buf_size(data), &zp, + dbuf_write_override_ready, dbuf_write_override_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + mutex_enter(&db->db_mtx); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies); + mutex_exit(&db->db_mtx); + } else if (db->db_state == DB_NOFILL) { + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, NULL, db->db.db_size, &zp, + dbuf_write_nofill_ready, dbuf_write_nofill_done, db, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); + } else { + ASSERT(arc_released(data)); + dr->dr_zio = arc_write(zio, os->os_spa, txg, + db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, + dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } +} diff --git a/uts/common/fs/zfs/ddt.c b/uts/common/fs/zfs/ddt.c new file mode 100644 index 000000000000..718331496765 --- /dev/null +++ b/uts/common/fs/zfs/ddt.c @@ -0,0 +1,1146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/dsl_pool.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dsl_scan.h> + +/* + * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
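+ *
+ * [Editor's note -- not part of the original source.  zfs_dedup_prefetch
+ * is a plain global consulted in ddt_prefetch() below, so it can be
+ * tuned without a rebuild.  On a Solaris-derived kernel that would
+ * typically mean something like
+ *
+ *	set zfs:zfs_dedup_prefetch = 0
+ *
+ * in /etc/system, or poking the variable with mdb -kw on a live
+ * system; both are shown only as illustrations of how such tunables
+ * are usually adjusted, not as documented interfaces of this code.]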
+ */ +int zfs_dedup_prefetch = 1; + +static const ddt_ops_t *ddt_ops[DDT_TYPES] = { + &ddt_zap_ops, +}; + +static const char *ddt_class_name[DDT_CLASSES] = { + "ditto", + "duplicate", + "unique", +}; + +static void +ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp == 0); + VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); + ASSERT(*objectp != 0); + + VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx) == 0); + + VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static void +ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp != 0); + ASSERT(ddt_object_count(ddt, type, class) == 0); + ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); + VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); + VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); + VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); + + *objectp = 0; +} + +static int +ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + int error; + + ddt_object_name(ddt, type, class, name); + + error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); + + if (error) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class]); + + /* + * Seed the cached statistics. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ASSERT(error == 0); + return (error); +} + +static void +ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); + + /* + * Cache DDT statistics; this is the only time they'll change. 
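+ *
+ * [Editor's note -- illustrative, not part of the original source.
+ * Of the figures cached below, ddo_dspace is the object's allocated
+ * size in bytes (doi_physical_blocks_512 is in 512-byte sectors,
+ * hence the << 9), while ddo_mspace is its logical footprint, the
+ * number of filled blocks times the block size.  E.g. a table of
+ * 1000 4K blocks that compress down to 2600 sectors on disk would
+ * report ddo_mspace == 4096000 and ddo_dspace == 1331200.]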
+ */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; +} + +static int +ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); +} + +static void +ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, + ddt->ddt_object[type][class], dde); +} + +int +ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +static int +ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +int +ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); +} + +uint64_t +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class])); +} + +int +ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); +} + +boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + +void +ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) +{ + (void) sprintf(name, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); +} + +void +ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +{ + ASSERT(txg != 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); +} + +void +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +{ + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, DMU_OT_DEDUP); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +void +ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) +{ + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); +} + +void +ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +{ + ASSERT(ddp->ddp_phys_birth == 0); + + for (int d = 0; d < 
SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); +} + +void +ddt_phys_clear(ddt_phys_t *ddp) +{ + bzero(ddp, sizeof (*ddp)); +} + +void +ddt_phys_addref(ddt_phys_t *ddp) +{ + ddp->ddp_refcnt++; +} + +void +ddt_phys_decref(ddt_phys_t *ddp) +{ + ASSERT((int64_t)ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; +} + +void +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +{ + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); +} + +ddt_phys_t * +ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +{ + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_entry_t *dde) +{ + uint64_t refcnt = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); +} + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + bzero(dds, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; +} + +static void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + bzero(dds, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). 
*/ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. */ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + return (dds_total.dds_ref_dsize - dds_total.dds_dsize); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} + +int +ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) +{ + spa_t *spa = ddt->ddt_spa; + uint64_t total_refcnt = 0; + uint64_t ditto = spa->spa_dedup_ditto; + int total_copies = 0; + int desired_copies = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *zio = dde->dde_lead_zio[p]; + uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ + if (zio != NULL) + refcnt += zio->io_parent_count; /* pending refs */ + if (ddp == ddp_willref) + refcnt++; /* caller's ref */ + if (refcnt != 0) { + total_refcnt += refcnt; + total_copies += p; + } + } + + if (ditto == 0 || ditto > UINT32_MAX) + ditto = UINT32_MAX; + + if (total_refcnt >= 1) + desired_copies++; + if (total_refcnt >= ditto) + desired_copies++; + if (total_refcnt >= ditto * ditto) + desired_copies++; + + return (MAX(desired_copies, total_copies) - total_copies); +} + +int +ddt_ditto_copies_present(ddt_entry_t *dde) +{ + ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; + dva_t *dva = ddp->ddp_dva; + int copies = 0 - DVA_GET_GANG(dva); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + if (DVA_IS_VALID(dva)) + copies++; + + ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); + + return (copies); +} + +size_t +ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + + return (c_len + 1); +} + +void 
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + byteswap_uint64_array(dst, d_len); +} + +ddt_t * +ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) +{ + return (spa->spa_ddt[c]); +} + +ddt_t * +ddt_select(spa_t *spa, const blkptr_t *bp) +{ + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); +} + +void +ddt_enter(ddt_t *ddt) +{ + mutex_enter(&ddt->ddt_lock); +} + +void +ddt_exit(ddt_t *ddt) +{ + mutex_exit(&ddt->ddt_lock); +} + +static ddt_entry_t * +ddt_alloc(const ddt_key_t *ddk) +{ + ddt_entry_t *dde; + + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); +} + +static void +ddt_free(ddt_entry_t *dde) +{ + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_data != NULL) + zio_buf_free(dde->dde_repair_data, + DDK_GET_PSIZE(&dde->dde_key)); + + cv_destroy(&dde->dde_cv); + kmem_free(dde, sizeof (*dde)); +} + +void +ddt_remove(ddt_t *ddt, ddt_entry_t *dde) +{ + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); +} + +ddt_entry_t * +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +{ + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return (NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) + break; + } + if (error != ENOENT) + break; + } + + ASSERT(error == 0 || error == ENOENT); + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); +} + +void +ddt_prefetch(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) + return; + + /* + * We only remove the DDT once all tables are empty and only + * prefetch dedup blocks when there are entries in the DDT. + * Thus no locking is required as the DDT can't disappear on us. 
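+	 * (For comparison, a hypothetical writer that does take the lock uses
+	 * the entry interfaces above roughly as follows -- names other than
+	 * the ddt_*() calls are illustrative:
+	 *
+	 *	ddt_t *ddt = ddt_select(spa, bp);
+	 *
+	 *	ddt_enter(ddt);
+	 *	dde = ddt_lookup(ddt, bp, B_TRUE);  -- may cv_wait on dde_cv
+	 *	... adjust dde->dde_phys[] reference counts ...
+	 *	ddt_exit(ddt);
+	 * )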
+ */ + ddt = ddt_select(spa, bp); + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &dde); + } + } +} + +int +ddt_entry_compare(const void *x1, const void *x2) +{ + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; + const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; + + for (int i = 0; i < DDT_KEY_WORDS; i++) { + if (u1[i] < u2[i]) + return (-1); + if (u1[i] > u2[i]) + return (1); + } + + return (0); +} + +static ddt_t * +ddt_table_alloc(spa_t *spa, enum zio_checksum c) +{ + ddt_t *ddt; + + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); +} + +static void +ddt_table_free(ddt_t *ddt) +{ + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_free(ddt, sizeof (*ddt)); +} + +void +ddt_create(spa_t *spa) +{ + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); +} + +int +ddt_load(spa_t *spa) +{ + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + + /* + * Seed the cached histograms. + */ + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + } + + return (0); +} + +void +ddt_unload(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } +} + +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +ddt_entry_t * +ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) +{ + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. 
For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. + */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); +} + +void +ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) +{ + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); +} + +static void +ddt_repair_entry_done(zio_t *zio) +{ + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); +} + +static void +ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) +{ + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); +} + +static void +ddt_repair_table(ddt_t *ddt, zio_t *rio) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); +} + +static void +ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +{ + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT((int64_t)ddp->ddp_refcnt >= 0); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) + nclass = DDT_CLASS_DITTO; + else if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + 
ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) + */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } + } +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object, tx) == 0); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + uint64_t count = 0; + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + count += ddt_object_count(ddt, type, class); + } + } + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (count == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + } + + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); +} + +void +ddt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + zio_t *rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + + dmu_tx_commit(tx); +} + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (ENOENT); +} diff --git a/uts/common/fs/zfs/ddt_zap.c b/uts/common/fs/zfs/ddt_zap.c new file mode 100644 index 000000000000..d6a991c7c19e --- /dev/null +++ b/uts/common/fs/zfs/ddt_zap.c @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <util/sscanf.h> + +int ddt_zap_leaf_blockshift = 12; +int ddt_zap_indirect_blockshift = 12; + +static int +ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) +{ + zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; + + if (prehash) + flags |= ZAP_FLAG_PRE_HASHED_KEY; + + *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, + ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + DMU_OT_NONE, 0, tx); + + return (*objectp == 0 ? ENOTSUP : 0); +} + +static int +ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + return (zap_destroy(os, object, tx)); +} + +static int +ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t one, csize; + int error; + + error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, &one, &csize); + if (error) + return (error); + + ASSERT(one == 1); + ASSERT(csize <= sizeof (cbuf)); + + error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf); + if (error) + return (error); + + ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); + + return (0); +} + +static void +ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS); +} + +static int +ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize; + + csize = ddt_compress(dde->dde_phys, cbuf, + sizeof (dde->dde_phys), sizeof (cbuf)); + + return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf, tx)); +} + +static int +ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, tx)); +} + +static int +ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + zap_cursor_init_serialized(&zc, os, object, *walk); + if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize = za.za_num_integers; + ASSERT(za.za_integer_length == 1); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, + DDT_KEY_WORDS, 1, csize, cbuf); + ASSERT(error == 0); + if (error == 0) { + ddt_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); + dde->dde_key = *(ddt_key_t *)za.za_name; + } + zap_cursor_advance(&zc); + *walk = zap_cursor_serialize(&zc); + } + zap_cursor_fini(&zc); + return (error); +} + +static uint64_t +ddt_zap_count(objset_t *os, uint64_t object) +{ + uint64_t count = 0; + + VERIFY(zap_count(os, object, &count) == 0); + + return (count); 
+} + +const ddt_ops_t ddt_zap_ops = { + "zap", + ddt_zap_create, + ddt_zap_destroy, + ddt_zap_lookup, + ddt_zap_prefetch, + ddt_zap_update, + ddt_zap_remove, + ddt_zap_walk, + ddt_zap_count, +}; diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c new file mode 100644 index 000000000000..39234eba53b2 --- /dev/null +++ b/uts/common/fs/zfs/dmu.c @@ -0,0 +1,1764 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_prop.h> +#include <sys/dmu_zfetch.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/sa.h> +#ifdef _KERNEL +#include <sys/vmsystm.h> +#include <sys/zfs_znode.h> +#endif + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + { byteswap_uint8_array, TRUE, "unallocated" }, + { zap_byteswap, TRUE, "object directory" }, + { byteswap_uint64_array, TRUE, "object array" }, + { byteswap_uint8_array, TRUE, "packed nvlist" }, + { byteswap_uint64_array, TRUE, "packed nvlist size" }, + { byteswap_uint64_array, TRUE, "bpobj" }, + { byteswap_uint64_array, TRUE, "bpobj header" }, + { byteswap_uint64_array, TRUE, "SPA space map header" }, + { byteswap_uint64_array, TRUE, "SPA space map" }, + { byteswap_uint64_array, TRUE, "ZIL intent log" }, + { dnode_buf_byteswap, TRUE, "DMU dnode" }, + { dmu_objset_byteswap, TRUE, "DMU objset" }, + { byteswap_uint64_array, TRUE, "DSL directory" }, + { zap_byteswap, TRUE, "DSL directory child map"}, + { zap_byteswap, TRUE, "DSL dataset snap map" }, + { zap_byteswap, TRUE, "DSL props" }, + { byteswap_uint64_array, TRUE, "DSL dataset" }, + { zfs_znode_byteswap, TRUE, "ZFS znode" }, + { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, + { byteswap_uint8_array, FALSE, "ZFS plain file" }, + { zap_byteswap, TRUE, "ZFS directory" }, + { zap_byteswap, TRUE, "ZFS master node" }, + { zap_byteswap, TRUE, "ZFS delete queue" }, + { byteswap_uint8_array, FALSE, "zvol object" }, + { zap_byteswap, TRUE, "zvol prop" }, + { byteswap_uint8_array, FALSE, "other uint8[]" }, + { byteswap_uint64_array, FALSE, "other uint64[]" }, + { zap_byteswap, TRUE, "other ZAP" }, + { zap_byteswap, TRUE, "persistent error log" }, + { byteswap_uint8_array, TRUE, "SPA history" }, + { byteswap_uint64_array, TRUE, "SPA history offsets" }, + { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "DSL permissions" }, + { 
zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, + { byteswap_uint8_array, TRUE, "FUID table" }, + { byteswap_uint64_array, TRUE, "FUID table size" }, + { zap_byteswap, TRUE, "DSL dataset next clones"}, + { zap_byteswap, TRUE, "scan work queue" }, + { zap_byteswap, TRUE, "ZFS user/group used" }, + { zap_byteswap, TRUE, "ZFS user/group quota" }, + { zap_byteswap, TRUE, "snapshot refcount tags"}, + { zap_byteswap, TRUE, "DDT ZAP algorithm" }, + { zap_byteswap, TRUE, "DDT statistics" }, + { byteswap_uint8_array, TRUE, "System attributes" }, + { zap_byteswap, TRUE, "SA master node" }, + { zap_byteswap, TRUE, "SA attr registration" }, + { zap_byteswap, TRUE, "SA attr layouts" }, + { zap_byteswap, TRUE, "scan translations" }, + { byteswap_uint8_array, FALSE, "deduplicated block" }, + { zap_byteswap, TRUE, "DSL deadlist map" }, + { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, + { zap_byteswap, TRUE, "DSL dir clones" }, + { byteswap_uint64_array, TRUE, "bpobj subobj" }, +}; + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + dnode_t *dn; + uint64_t blkid; + dmu_buf_impl_t *db; + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + blkid = dbuf_whichblock(dn, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) { + err = EIO; + } else { + err = dbuf_read(db, NULL, db_flags); + if (err) { + dbuf_rele(db, tag); + db = NULL; + } + } + + dnode_rele(dn, FTAG); + *dbp = &db->db; /* NULL db plus first field offset is NULL */ + return (err); +} + +int +dmu_bonus_max(void) +{ + return (DN_MAX_BONUSLEN); +} + +int +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = EINVAL; + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = EINVAL; + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +int +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (type > DMU_OT_NUMTYPES) { + error = EINVAL; + } else if (dn->dn_bonus != db) { + error = EINVAL; + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); +} + +int +dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + dbuf_rm_spill(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_rm_spill(dn, tx); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); +} + +/* + * returns ENOENT, EIO, or 0. 
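+ *
+ * A hypothetical caller updating an object's bonus area (names other than
+ * the dmu_*() calls are illustrative) looks like:
+ *
+ *	dmu_buf_t *db;
+ *
+ *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
+ *		dmu_buf_will_dirty(db, tx);
+ *		bcopy(&phys, db->db_data, sizeof (phys));
+ *		dmu_buf_rele(db, FTAG);
+ *	}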
+ */ +int +dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + dmu_buf_impl_t *db; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + if (error) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus == NULL) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dbuf_create_bonus(dn); + } + db = dn->dn_bonus; + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) { + VERIFY(dnode_add_ref(dn, db)); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. + */ + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); + + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); + + *dbp = &db->db; + return (0); +} + +/* + * returns ENOENT, EIO, or 0. + * + * This interface will allocate a blank spill dbuf when a spill blk + * doesn't already exist on the dnode. + * + * if you only want to find an already existing spill db, then + * dmu_spill_hold_existing() should be used. + */ +int +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = NULL; + int err; + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + + ASSERT(db != NULL); + err = dbuf_read(db, NULL, flags); + if (err == 0) + *dbp = &db->db; + else + dbuf_rele(db, tag); + return (err); +} + +int +dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { + err = EINVAL; + } else { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + err = ENOENT; + } else { + err = dmu_spill_hold_by_dnode(dn, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); + } + + rw_exit(&dn->dn_struct_rwlock); + } + + DB_DNODE_EXIT(db); + return (err); +} + +int +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Note: longer-term, we should modify all of the dmu_buf_*() interfaces + * to take a held dnode rather than <os, object> -- the lookup is wasteful, + * and can induce severe lock contention when writing to several files + * whose dnodes are in the same block. 
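+ *
+ * The *_by_dnode() and *_by_bonus()/*_by_dbuf() helpers below are the
+ * held-dnode forms this note refers to: the caller already has a dbuf, so
+ * DB_DNODE_ENTER()/DB_DNODE_EXIT() can reach its dnode directly and the
+ * per-call dnode_hold() hash lookup is skipped.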
+ */ +static int +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) +{ + dsl_pool_t *dp = NULL; + dmu_buf_t **dbp; + uint64_t blkid, nblks, i; + uint32_t dbuf_flags; + int err; + zio_t *zio; + hrtime_t start; + + ASSERT(length <= DMU_MAX_ACCESS); + + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - + P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; + } else { + if (offset + length > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); + return (EIO); + } + nblks = 1; + } + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + if (db == NULL) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_rele_array(dbp, nblks, tag); + zio_nowait(zio); + return (EIO); + } + /* initiate async i/o */ + if (read) { + (void) dbuf_read(db, zio, dbuf_flags); + } + dbp[i] = &db->db; + } + rw_exit(&dn->dn_struct_rwlock); + + /* wait for async i/o */ + err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } + + *numbufsp = nblks; + *dbpp = dbp; + return (0); +} + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); + + return (err); +} + +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +{ + int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + if (numbufs == 0) + return; + + for (i = 0; i 
< numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); + } + + kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); +} + +void +dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) +{ + dnode_t *dn; + uint64_t blkid; + int nblks, i, err; + + if (zfs_prefetch_disable) + return; + + if (len == 0) { /* they're interested in the bonus buffer */ + dn = DMU_META_DNODE(os); + + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, blkid); + rw_exit(&dn->dn_struct_rwlock); + return; + } + + /* + * XXX - Note, if the dnode for the requested object is not + * already cached, we will do a *synchronous* read in the + * dnode_hold() call. The same is true for any indirects. + */ + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - + P2ALIGN(offset, 1<<blkshift)) >> blkshift; + } else { + nblks = (offset < dn->dn_datablksz); + } + + if (nblks != 0) { + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) + dbuf_prefetch(dn, blkid+i); + } + + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); +} + +/* + * Get the next "chunk" of file data to free. We traverse the file from + * the end so that the file gets shorter over time (if we crashes in the + * middle, this will leave us in a better state). We find allocated file + * data by simply searching the allocated level 1 indirects. + */ +static int +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) +{ + uint64_t len = *start - limit; + uint64_t blkcnt = 0; + uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); + uint64_t iblkrange = + dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + + ASSERT(limit <= *start); + + if (len <= iblkrange * maxblks) { + *start = limit; + return (0); + } + ASSERT(ISP2(iblkrange)); + + while (*start > limit && blkcnt < maxblks) { + int err; + + /* find next allocated L1 indirect */ + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, start, 2, 1, 0); + + /* if there are no more, then we are done */ + if (err == ESRCH) { + *start = limit; + return (0); + } else if (err) { + return (err); + } + blkcnt += 1; + + /* reset offset to end of "next" block back */ + *start = P2ALIGN(*start, iblkrange); + if (*start <= limit) + *start = limit; + else + *start -= 1; + } + return (0); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, + uint64_t length, boolean_t free_dnode) +{ + dmu_tx_t *tx; + uint64_t object_size, start, end, len; + boolean_t trunc = (length == DMU_OBJECT_END); + int align, err; + + align = 1 << dn->dn_datablkshift; + ASSERT(align > 0); + object_size = align == 1 ? dn->dn_datablksz : + (dn->dn_maxblkid + 1) << dn->dn_datablkshift; + + end = offset + length; + if (trunc || end > object_size) + end = object_size; + if (end <= offset) + return (0); + length = end - offset; + + while (length) { + start = end; + /* assert(offset <= start) */ + err = get_next_chunk(dn, &start, offset); + if (err) + return (err); + len = trunc ? DMU_OBJECT_END : end - start; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, start, len); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + dnode_free_range(dn, start, trunc ? 
-1 : len, tx); + + if (start == 0 && free_dnode) { + ASSERT(trunc); + dnode_free(dn, tx); + } + + length -= end - start; + + dmu_tx_commit(tx); + end = start; + } + return (0); +} + +int +dmu_free_long_range(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return (err); + err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_object(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_tx_t *tx; + int err; + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err != 0) + return (err); + if (dn->dn_nlevels == 1) { + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } else { + err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + } + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + dnode_t *dn; + int err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + ASSERT(offset < UINT64_MAX); + ASSERT(size == -1ULL || size <= UINT64_MAX - offset); + dnode_free_range(dn, offset, size, tx); + dnode_rele(dn, FTAG); + return (0); +} + +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) +{ + dnode_t *dn; + dmu_buf_t **dbp; + int numbufs, err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + /* + * Deal with odd block sizes, where there can't be data past the first + * block. If we ever do the tail block optimization, we will need to + * handle that here as well. + */ + if (dn->dn_maxblkid == 0) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)buf + newsz, size - newsz); + size = newsz; + } + + while (size > 0) { + uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int i; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. 
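+		 * (Concretely, dmu_buf_hold_array_by_dnode() issues all of
+		 * the dbuf_read()s for this chunk under one zio_root() and
+		 * waits once, so up to DMU_MAX_ACCESS / 2 bytes worth of
+		 * blocks are in flight together instead of one at a time.)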
+ */ + err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, + TRUE, FTAG, &numbufs, &dbp, flags); + if (err) + break; + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + dnode_rele(dn, FTAG); + return (err); +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + bcopy(buf, (char *)db->db_data + bufoff, tocpy); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + dmu_buf_t *db = dbp[i]; + + dmu_buf_will_not_fill(db, tx); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ + dmu_xuio_t *priv; + uio_t *uio = &xuio->xu_uio; + + uio->uio_iovcnt = nblk; + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv->cnt = nblk; + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->iovp = uio->uio_iov; + XUIO_XUZC_PRIV(xuio) = priv; + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + + return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int nblk = priv->cnt; + + kmem_free(priv->iovp, nblk * sizeof (iovec_t)); + kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); + kmem_free(priv, sizeof (dmu_xuio_t)); + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. 
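+ *
+ * (Aside on dmu_write() above: callers must pass a transaction that has
+ * already been assigned and that covers the range being written.  A
+ * hypothetical sketch, assuming the usual dmu_tx_hold_write() hold:
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, offset, size);
+ *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
+ *		dmu_write(os, object, offset, size, buf, tx);
+ *		dmu_tx_commit(tx);
+ *	} else {
+ *		dmu_tx_abort(tx);
+ *	}
+ * )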
+ */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ + struct iovec *iov; + uio_t *uio = &xuio->xu_uio; + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int i = priv->next++; + + ASSERT(i < priv->cnt); + ASSERT(off + n <= arc_buf_size(abuf)); + iov = uio->uio_iov + i; + iov->iov_base = (char *)abuf->b_data + off; + iov->iov_len = n; + priv->bufs[i] = abuf; + return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + priv->bufs[i] = NULL; +} + +static void +xuio_stat_init(void) +{ + xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (xuio_ksp != NULL) { + xuio_ksp->ks_data = &xuio_stats; + kstat_install(xuio_ksp); + } +} + +static void +xuio_stat_fini(void) +{ + if (xuio_ksp != NULL) { + kstat_delete(xuio_ksp); + xuio_ksp = NULL; + } +} + +void +xuio_stat_wbuf_copied() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} + +#ifdef _KERNEL +int +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +{ + dmu_buf_t **dbp; + int numbufs, i, err; + xuio_t *xuio = NULL; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, + &numbufs, &dbp); + if (err) + return (err); + + if (uio->uio_extflg == UIO_XUIO) + xuio = (xuio_t *)uio; + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + if (xuio) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) { + uio->uio_resid -= tocpy; + uio->uio_loffset += tocpy; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else { + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + } + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +static int +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + int err = 0; + int i; + + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio->uio_loffset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + /* + * XXX uiomove could block forever (eg. nfs-backed + * pages). There needs to be a uiolockdown() function + * to lock the pages in memory, so that uiomove won't + * block. 
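+		 * (Note on the dmu_buf_will_fill()/dmu_buf_will_dirty()
+		 * choice above: when the copy covers the whole dbuf,
+		 * will_fill lets the block be overwritten without reading
+		 * its old contents; a partial copy must dirty the existing
+		 * block for a read-modify-write.)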
+ */ + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + size -= tocpy; + } + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +int +dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); + + return (err); +} + +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_write_uio_dnode(dn, uio, size, tx); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + page_t *pp, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(pp, S_READ); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(pp, va); + pp = pp->p_next; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} +#endif + +/* + * Allocate a loaned anonymous arc buffer. + */ +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; + spa_t *spa; + + DB_GET_SPA(&spa, db); + return (arc_loan_buf(spa, size)); +} + +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); +} + +/* + * When possible directly assign passed loaned arc buffer to a dbuf. + * If this is not possible copy the contents of passed arc buf via + * dmu_write(). 
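+ *
+ * A hypothetical zero-copy writer (bonus_db, blksz and off are
+ * caller-supplied names) would do:
+ *
+ *	abuf = dmu_request_arcbuf(bonus_db, blksz);
+ *	... fill abuf->b_data with blksz bytes ...
+ *	dmu_assign_arcbuf(bonus_db, off, abuf, tx);
+ *
+ * On the copy path below the buffer is written with dmu_write() and then
+ * handed back via dmu_return_arcbuf().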
+ */ +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + dnode_t *dn; + dmu_buf_impl_t *db; + uint32_t blksz = (uint32_t)arc_buf_size(buf); + uint64_t blkid; + + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, offset); + VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(dbuf); + + if (offset == db->db.db_offset && blksz == db->db.db_size) { + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + objset_t *os; + uint64_t object; + + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); + os = dn->dn_objset; + object = dn->dn_object; + DB_DNODE_EXIT(dbuf); + + dbuf_rele(db, FTAG); + dmu_write(os, object, offset, blksz, buf->b_data, tx); + dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); + } +} + +typedef struct { + dbuf_dirty_record_t *dsa_dr; + dmu_sync_cb_t *dsa_done; + zgd_t *dsa_zgd; + dmu_tx_t *dsa_tx; +} dmu_sync_arg_t; + +/* ARGSUSED */ +static void +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *dsa = varg; + dmu_buf_t *db = dsa->dsa_zgd->zgd_db; + blkptr_t *bp = zio->io_bp; + + if (zio->io_error == 0) { + if (BP_IS_HOLE(bp)) { + /* + * A block of zeros may compress to a hole, but the + * block size still needs to be known for replay. + */ + BP_SET_LSIZE(bp, db->db_size); + } else { + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } + } +} + +static void +dmu_sync_late_arrival_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + +/* ARGSUSED */ +static void +dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *dsa = varg; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + + mutex_enter(&db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); + if (zio->io_error == 0) { + dr->dt.dl.dr_overridden_by = *zio->io_bp; + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) + BP_ZERO(&dr->dt.dl.dr_overridden_by); + } else { + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + } + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); + + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static void +dmu_sync_late_arrival_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *dsa = zio->io_private; + + if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } + + dmu_tx_commit(dsa->dsa_tx); + + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static int +dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, + zio_prop_t *zp, zbookmark_t *zb) +{ + dmu_sync_arg_t *dsa; + dmu_tx_t *tx; + + tx = dmu_tx_create(os); + dmu_tx_hold_space(tx, zgd->zgd_db->db_size); + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + dmu_tx_abort(tx); + return (EIO); /* Make zl_get_data do txg_waited_synced() */ + } + + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = NULL; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = tx; + + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, 
dsa,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+	return (0);
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ *	The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ *	The caller should not log the write.
+ *
+ * EALREADY: this block is already in the process of being synced.
+ *	The caller should track its progress (somehow).
+ *
+ * EIO: could not do the I/O.
+ *	The caller should do a txg_wait_synced().
+ *
+ * 0: the I/O has been initiated.
+ *	The caller should log this blkptr in the done callback.
+ *	It is possible that the I/O will fail, in which case
+ *	the error will be reported to the done callback and
+ *	propagated to pio from zio_done().
+ */
+int
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
+{
+	blkptr_t *bp = zgd->zgd_bp;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+	objset_t *os = db->db_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	dbuf_dirty_record_t *dr;
+	dmu_sync_arg_t *dsa;
+	zbookmark_t zb;
+	zio_prop_t zp;
+	dnode_t *dn;
+
+	ASSERT(pio != NULL);
+	ASSERT(BP_IS_HOLE(bp));
+	ASSERT(txg != 0);
+
+	SET_BOOKMARK(&zb, ds->ds_object,
+	    db->db.db_object, db->db_level, db->db_blkid);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+	DB_DNODE_EXIT(db);
+
+	/*
+	 * If we're frozen (running ziltest), we always need to generate a bp.
+	 */
+	if (txg > spa_freeze_txg(os->os_spa))
+		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+
+	/*
+	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+	 * and us. If we determine that this txg is not yet syncing,
+	 * but it begins to sync a moment later, that's OK because the
+	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
+	 */
+	mutex_enter(&db->db_mtx);
+
+	if (txg <= spa_last_synced_txg(os->os_spa)) {
+		/*
+		 * This txg has already synced. There's nothing to do.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (EEXIST);
+	}
+
+	if (txg <= spa_syncing_txg(os->os_spa)) {
+		/*
+		 * This txg is currently syncing, so we can't mess with
+		 * the dirty record anymore; just write a new log block.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+	}
+
+	dr = db->db_last_dirty;
+	while (dr && dr->dr_txg != txg)
+		dr = dr->dr_next;
+
+	if (dr == NULL) {
+		/*
+		 * There's no dr for this dbuf, so it must have been freed.
+		 * There's no need to log writes to freed blocks, so we're done.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (ENOENT);
+	}
+
+	ASSERT(dr->dr_txg == txg);
+	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		/*
+		 * We have already issued a sync write for this buffer,
+		 * or this buffer has already been synced. It could not
+		 * have been dirtied since, or we would have cleared the state.
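+		 * (State machine: dmu_sync() moves DR_NOT_OVERRIDDEN to
+		 * DR_IN_DMU_SYNC just below, and dmu_sync_done() above then
+		 * records DR_OVERRIDDEN on success or returns the record to
+		 * DR_NOT_OVERRIDDEN on error, so a failed attempt can be
+		 * retried by a later dmu_sync() call.)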
+		 */
+		mutex_exit(&db->db_mtx);
+		return (EALREADY);
+	}
+
+	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+	mutex_exit(&db->db_mtx);
+
+	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+	dsa->dsa_dr = dr;
+	dsa->dsa_done = done;
+	dsa->dsa_zgd = zgd;
+	dsa->dsa_tx = NULL;
+
+	zio_nowait(arc_write(pio, os->os_spa, txg,
+	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
+	    dmu_sync_ready, dmu_sync_done, dsa,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+
+	return (0);
+}
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+	err = dnode_set_blksz(dn, size, ibs, tx);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	/* XXX assumes dnode_hold will not get an i/o error */
+	(void) dnode_hold(os, object, FTAG, &dn);
+	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	dn->dn_checksum = checksum;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+}
+
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	/* XXX assumes dnode_hold will not get an i/o error */
+	(void) dnode_hold(os, object, FTAG, &dn);
+	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+	dn->dn_compress = compress;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+}
+
+int zfs_mdcomp_disable = 0;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+	boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
+	    (wp & WP_SPILL));
+	enum zio_checksum checksum = os->os_checksum;
+	enum zio_compress compress = os->os_compress;
+	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+	boolean_t dedup;
+	boolean_t dedup_verify = os->os_dedup_verify;
+	int copies = os->os_copies;
+
+	/*
+	 * Determine checksum setting.
+	 */
+	if (ismd) {
+		/*
+		 * Metadata always gets checksummed. If the data
+		 * checksum is multi-bit correctable, and it's not a
+		 * ZBT-style checksum, then it's suitable for metadata
+		 * as well. Otherwise, the metadata checksum defaults
+		 * to fletcher4.
+		 */
+		if (zio_checksum_table[checksum].ci_correctable < 1 ||
+		    zio_checksum_table[checksum].ci_eck)
+			checksum = ZIO_CHECKSUM_FLETCHER_4;
+	} else {
+		checksum = zio_checksum_select(dn->dn_checksum, checksum);
+	}
+
+	/*
+	 * Determine compression setting.
+	 */
+	if (ismd) {
+		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+		    ZIO_COMPRESS_LZJB;
+	} else {
+		compress = zio_compress_select(dn->dn_compress, compress);
+	}
+
+	/*
+	 * Determine dedup setting. If we are in dmu_sync(), we won't
+	 * actually dedup now because that's all done in syncing context;
+	 * but we do want to use the dedup checksum. If the checksum is not
+	 * strong enough to ensure unique signatures, force dedup_verify.
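+	 * (Concretely: dedup_checksum is taken from the dataset's dedup
+	 * property; ZIO_DEDUPCHECKSUM is SHA-256, whose zio_checksum_table
+	 * entry has ci_dedup set, so the common case dedups without
+	 * verification, while a checksum without ci_dedup -- or an explicit
+	 * dedup=verify setting -- turns on zp_dedup_verify below.)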
+ */ + dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); + if (dedup) { + checksum = dedup_checksum; + if (!zio_checksum_table[checksum].ci_dedup) + dedup_verify = 1; + } + + if (wp & WP_DMU_SYNC) + dedup = 0; + + if (wp & WP_NOFILL) { + ASSERT(!ismd && level == 0); + checksum = ZIO_CHECKSUM_OFF; + compress = ZIO_COMPRESS_OFF; + dedup = B_FALSE; + } + + zp->zp_checksum = checksum; + zp->zp_compress = compress; + zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; + zp->zp_level = level; + zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); + zp->zp_dedup = dedup; + zp->zp_dedup_verify = dedup && dedup_verify; +} + +int +dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) +{ + dnode_t *dn; + int i, err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + /* + * Sync any current changes before + * we go trundling through the block pointers. + */ + for (i = 0; i < TXG_SIZE; i++) { + if (list_link_active(&dn->dn_dirty_link[i])) + break; + } + if (i != TXG_SIZE) { + dnode_rele(dn, FTAG); + txg_wait_synced(dmu_objset_pool(os), 0); + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + } + + err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); + dnode_rele(dn, FTAG); + + return (err); +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + dnode_phys_t *dnp; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + dnp = dn->dn_phys; + + doi->doi_data_block_size = dn->dn_datablksz; + doi->doi_metadata_block_size = dn->dn_indblkshift ? + 1ULL << dn->dn_indblkshift : 0; + doi->doi_type = dn->dn_type; + doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_indirection = dn->dn_nlevels; + doi->doi_checksum = dn->dn_checksum; + doi->doi_compress = dn->dn_compress; + doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; + doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_fill_count = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; + + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); +} + +/* + * Get information on a DMU object. + * If doi is NULL, just indicates whether the object exists. + */ +int +dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) +{ + dnode_t *dn; + int err = dnode_hold(os, object, FTAG, &dn); + + if (err) + return (err); + + if (doi != NULL) + dmu_object_info_from_dnode(dn, doi); + + dnode_rele(dn, FTAG); + return (0); +} + +/* + * As above, but faster; can be used when you have a held dbuf in hand. + */ +void +dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + DB_DNODE_ENTER(db); + dmu_object_info_from_dnode(DB_DNODE(db), doi); + DB_DNODE_EXIT(db); +} + +/* + * Faster still when you only care about the size. + * This is specifically optimized for zfs_getattr(). 
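+ *
+ * (For the general case a hypothetical caller would use dmu_object_info()
+ * above and read whichever fields it needs, e.g.:
+ *
+ *	dmu_object_info_t doi;
+ *
+ *	if (dmu_object_info(os, object, &doi) == 0)
+ *		blksize = doi.doi_data_block_size;
+ * )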
+ */ +void +dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, + u_longlong_t *nblk512) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + *blksize = dn->dn_datablksz; + /* add 1 for dnode space */ + *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> + SPA_MINBLOCKSHIFT) + 1; + DB_DNODE_EXIT(db); +} + +void +byteswap_uint64_array(void *vbuf, size_t size) +{ + uint64_t *buf = vbuf; + size_t count = size >> 3; + int i; + + ASSERT((size & 7) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_64(buf[i]); +} + +void +byteswap_uint32_array(void *vbuf, size_t size) +{ + uint32_t *buf = vbuf; + size_t count = size >> 2; + int i; + + ASSERT((size & 3) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_32(buf[i]); +} + +void +byteswap_uint16_array(void *vbuf, size_t size) +{ + uint16_t *buf = vbuf; + size_t count = size >> 1; + int i; + + ASSERT((size & 1) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_16(buf[i]); +} + +/* ARGSUSED */ +void +byteswap_uint8_array(void *vbuf, size_t size) +{ +} + +void +dmu_init(void) +{ + zfs_dbgmsg_init(); + sa_cache_init(); + xuio_stat_init(); + dmu_objset_init(); + dnode_init(); + dbuf_init(); + zfetch_init(); + arc_init(); + l2arc_init(); +} + +void +dmu_fini(void) +{ + l2arc_fini(); + arc_fini(); + zfetch_fini(); + dbuf_fini(); + dnode_fini(); + dmu_objset_fini(); + xuio_stat_fini(); + sa_cache_fini(); + zfs_dbgmsg_fini(); +} diff --git a/uts/common/fs/zfs/dmu_diff.c b/uts/common/fs/zfs/dmu_diff.c new file mode 100644 index 000000000000..22340ebc5374 --- /dev/null +++ b/uts/common/fs/zfs/dmu_diff.c @@ -0,0 +1,221 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> + +struct diffarg { + struct vnode *da_vp; /* file to which we are reporting */ + offset_t *da_offp; + int da_err; /* error that stopped diff search */ + dmu_diff_record_t da_ddr; +}; + +static int +write_record(struct diffarg *da) +{ + ssize_t resid; /* have to get resid to get detailed errno */ + + if (da->da_ddr.ddr_type == DDR_NONE) { + da->da_err = 0; + return (0); + } + + da->da_err = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)&da->da_ddr, + sizeof (da->da_ddr), 0, UIO_SYSSPACE, FAPPEND, + RLIM64_INFINITY, CRED(), &resid); + *da->da_offp += sizeof (da->da_ddr); + return (da->da_err); +} + +static int +report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) +{ + ASSERT(first <= last); + if (da->da_ddr.ddr_type != DDR_FREE || + first != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_FREE; + da->da_ddr.ddr_first = first; + da->da_ddr.ddr_last = last; + return (0); + } + da->da_ddr.ddr_last = last; + return (0); +} + +static int +report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) +{ + ASSERT(dnp != NULL); + if (dnp->dn_type == DMU_OT_NONE) + return (report_free_dnode_range(da, object, object)); + + if (da->da_ddr.ddr_type != DDR_INUSE || + object != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_INUSE; + da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; + return (0); + } + da->da_ddr.ddr_last = object; + return (0); +} + +#define DBP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +/* ARGSUSED */ +static int +diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct diffarg *da = arg; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + if (zb->zb_object != DMU_META_DNODE_OBJECT) + return (0); + + if (bp == NULL) { + uint64_t span = DBP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + + err = report_free_dnode_range(da, dnobj, + dnobj + (span >> DNODE_SHIFT) - 1); + if (err) + return (err); + } else if (zb->zb_level == 0) { + dnode_phys_t *blk; + arc_buf_t *abuf; + uint32_t aflags = ARC_WAIT; + int blksz = BP_GET_LSIZE(bp); + int i; + + if (dsl_read(NULL, spa, bp, pbuf, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + blk = abuf->b_data; + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = report_dnode(da, dnobj, blk+i); + if (err) + break; + } + (void) arc_buf_remove_ref(abuf, &abuf); + if (err) + return (err); + /* Don't care about the data blocks */ + return (TRAVERSE_VISIT_NO_CHILDREN); + } + return (0); +} + +int +dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) +{ + struct diffarg da; + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; + 
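+ /*
+  * Illustrative example of the record stream this function emits
+  * (made-up object numbers): if the traversal below reports dnodes
+  * 100-131 freed and then 132-163 freed, report_free_dnode_range()
+  * simply extends one pending DDR_FREE record to ddr_first = 100,
+  * ddr_last = 163; the next in-use dnode flushes that record through
+  * write_record() and starts a DDR_INUSE run, so the stream written
+  * to da_vp stays run-length encoded.
+  */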
dsl_dataset_t *findds; + dsl_dataset_t *relds; + int err = 0; + + /* make certain we are looking at snapshots */ + if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) + return (EINVAL); + + /* fromsnap must be earlier and from the same lineage as tosnap */ + if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) + return (EXDEV); + + relds = NULL; + findds = ds; + + while (fromds->ds_dir != findds->ds_dir) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (!dsl_dir_is_clone(findds->ds_dir)) { + if (relds) + dsl_dataset_rele(relds, FTAG); + return (EXDEV); + } + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); + rw_exit(&dp->dp_config_rwlock); + + if (relds) + dsl_dataset_rele(relds, FTAG); + + if (err) + return (EXDEV); + + relds = findds; + } + + if (relds) + dsl_dataset_rele(relds, FTAG); + + da.da_vp = vp; + da.da_offp = offp; + da.da_ddr.ddr_type = DDR_NONE; + da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; + da.da_err = 0; + + err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); + + if (err) { + da.da_err = err; + } else { + /* we set the da.da_err we return as side-effect */ + (void) write_record(&da); + } + + return (da.da_err); +} diff --git a/uts/common/fs/zfs/dmu_object.c b/uts/common/fs/zfs/dmu_object.c new file mode 100644 index 000000000000..8dff46048902 --- /dev/null +++ b/uts/common/fs/zfs/dmu_object.c @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dnode.h> + +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t object; + uint64_t L2_dnode_count = DNODES_PER_BLOCK << + (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); + dnode_t *dn = NULL; + int restarted = B_FALSE; + + mutex_enter(&os->os_obj_lock); + for (;;) { + object = os->os_obj_next; + /* + * Each time we polish off an L2 bp worth of dnodes + * (2^13 objects), move to another L2 bp that's still + * reasonably sparse (at most 1/4 full). Look from the + * beginning once, but after that keep looking from here. + * If we can't find one, just keep going from here. + */ + if (P2PHASE(object, L2_dnode_count) == 0) { + uint64_t offset = restarted ? 
object << DNODE_SHIFT : 0; + int error = dnode_next_offset(DMU_META_DNODE(os), + DNODE_FIND_HOLE, + &offset, 2, DNODES_PER_BLOCK >> 2, 0); + restarted = B_TRUE; + if (error == 0) + object = offset >> DNODE_SHIFT; + } + os->os_obj_next = ++object; + + /* + * XXX We should check for an i/o error here and return + * up to our caller. Actually we should pre-read it in + * dmu_tx_assign(), but there is currently no mechanism + * to do so. + */ + (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, + FTAG, &dn); + if (dn) + break; + + if (dmu_object_next(os, &object, B_TRUE, 0) == 0) + os->os_obj_next = object - 1; + } + + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + mutex_exit(&os->os_obj_lock); + + dmu_tx_add_new_object(tx, os, object); + return (object); +} + +int +dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + return (EBADF); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + if (err) + return (err); + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + dmu_tx_add_new_object(tx, os, object); + return (0); +} + +int +dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen) +{ + dnode_t *dn; + dmu_tx_t *tx; + int nblkptr; + int err; + + if (object == DMU_META_DNODE_OBJECT) + return (EBADF); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); + + if (dn->dn_type == ot && dn->dn_datablksz == blocksize && + dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) { + /* nothing is changing, this is a noop */ + dnode_rele(dn, FTAG); + return (0); + } + + if (bonustype == DMU_OT_SA) { + nblkptr = 1; + } else { + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + } + + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. + */ + if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) { + err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); + if (err) + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + goto out; + } + + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + + dmu_tx_commit(tx); +out: + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); + + ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) +{ + uint64_t offset = (*objectp + 1) << DNODE_SHIFT; + int error; + + error = dnode_next_offset(DMU_META_DNODE(os), + (hole ? 
DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); + + *objectp = offset >> DNODE_SHIFT; + + return (error); +} diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c new file mode 100644 index 000000000000..7caebd979f02 --- /dev/null +++ b/uts/common/fs/zfs/dmu_objset.c @@ -0,0 +1,1789 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include <sys/cred.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/dnode.h> +#include <sys/dbuf.h> +#include <sys/zvol.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/dmu_impl.h> +#include <sys/zfs_ioctl.h> +#include <sys/sa.h> +#include <sys/zfs_onexit.h> + +/* + * Needed to close a window in dnode_move() that allows the objset to be freed + * before it can be safely accessed. + */ +krwlock_t os_lock; + +void +dmu_objset_init(void) +{ + rw_init(&os_lock, NULL, RW_DEFAULT, NULL); +} + +void +dmu_objset_fini(void) +{ + rw_destroy(&os_lock); +} + +spa_t * +dmu_objset_spa(objset_t *os) +{ + return (os->os_spa); +} + +zilog_t * +dmu_objset_zil(objset_t *os) +{ + return (os->os_zil); +} + +dsl_pool_t * +dmu_objset_pool(objset_t *os) +{ + dsl_dataset_t *ds; + + if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) + return (ds->ds_dir->dd_pool); + else + return (spa_get_dsl(os->os_spa)); +} + +dsl_dataset_t * +dmu_objset_ds(objset_t *os) +{ + return (os->os_dsl_dataset); +} + +dmu_objset_type_t +dmu_objset_type(objset_t *os) +{ + return (os->os_phys->os_type); +} + +void +dmu_objset_name(objset_t *os, char *buf) +{ + dsl_dataset_name(os->os_dsl_dataset, buf); +} + +uint64_t +dmu_objset_id(objset_t *os) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + + return (ds ? ds->ds_object : 0); +} + +uint64_t +dmu_objset_syncprop(objset_t *os) +{ + return (os->os_sync); +} + +uint64_t +dmu_objset_logbias(objset_t *os) +{ + return (os->os_logbias); +} + +static void +checksum_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); +} + +static void +compression_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. 
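+ * (For a concrete value: setting compression=on delivers ZIO_COMPRESS_ON
+ * here, which zio_compress_select() below resolves to
+ * ZIO_COMPRESS_ON_VALUE, i.e. lzjb, assuming the usual mapping.)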
+ */ + ASSERT(newval != ZIO_COMPRESS_INHERIT); + + os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); +} + +static void +copies_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval > 0); + ASSERT(newval <= spa_max_replication(os->os_spa)); + + os->os_copies = newval; +} + +static void +dedup_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + spa_t *spa = os->os_spa; + enum zio_checksum checksum; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); + + os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; + os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); +} + +static void +primary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + os->os_primary_cache = newval; +} + +static void +secondary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + os->os_secondary_cache = newval; +} + +static void +sync_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || + newval == ZFS_SYNC_DISABLED); + + os->os_sync = newval; + if (os->os_zil) + zil_set_sync(os->os_zil, newval); +} + +static void +logbias_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + ASSERT(newval == ZFS_LOGBIAS_LATENCY || + newval == ZFS_LOGBIAS_THROUGHPUT); + os->os_logbias = newval; + if (os->os_zil) + zil_set_logbias(os->os_zil, newval); +} + +void +dmu_objset_byteswap(void *buf, size_t size) +{ + objset_phys_t *osp = buf; + + ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); + dnode_byteswap(&osp->os_meta_dnode); + byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); + osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size == sizeof (objset_phys_t)) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + } +} + +int +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + objset_t **osp) +{ + objset_t *os; + int i, err; + + ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); + os->os_dsl_dataset = ds; + os->os_spa = spa; + os->os_rootbp = bp; + if (!BP_IS_HOLE(os->os_rootbp)) { + uint32_t aflags = ARC_WAIT; + zbookmark_t zb; + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (DMU_OS_IS_L2CACHEABLE(os)) + aflags |= ARC_L2CACHE; + + dprintf_bp(os->os_rootbp, "reading %s", ""); + /* + * XXX when bprewrite scrub can change the bp, + * and this is called from dmu_objset_open_ds_os, the bp + * could change, and we'll need a lock. 
+ */ + err = dsl_read_nolock(NULL, spa, os->os_rootbp, + arc_getbuf_func, &os->os_phys_buf, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); + if (err) { + kmem_free(os, sizeof (objset_t)); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = EIO; + return (err); + } + + /* Increase the blocksize if we are permitted. */ + if (spa_version(spa) >= SPA_VERSION_USERSPACE && + arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_t *buf = arc_buf_alloc(spa, + sizeof (objset_phys_t), &os->os_phys_buf, + ARC_BUFC_METADATA); + bzero(buf->b_data, sizeof (objset_phys_t)); + bcopy(os->os_phys_buf->b_data, buf->b_data, + arc_buf_size(os->os_phys_buf)); + (void) arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf); + os->os_phys_buf = buf; + } + + os->os_phys = os->os_phys_buf->b_data; + os->os_flags = os->os_phys->os_flags; + } else { + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? + sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; + os->os_phys_buf = arc_buf_alloc(spa, size, + &os->os_phys_buf, ARC_BUFC_METADATA); + os->os_phys = os->os_phys_buf->b_data; + bzero(os->os_phys, size); + } + + /* + * Note: the changed_cb will be called once before the register + * func returns, thus changing the checksum/compression from the + * default (fletcher2/off). Snapshots don't need to know about + * checksum/compression/copies. + */ + if (ds) { + err = dsl_prop_register(ds, "primarycache", + primary_cache_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "secondarycache", + secondary_cache_changed_cb, os); + if (!dsl_dataset_is_snapshot(ds)) { + if (err == 0) + err = dsl_prop_register(ds, "checksum", + checksum_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "copies", + copies_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "dedup", + dedup_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "logbias", + logbias_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "sync", + sync_changed_cb, os); + } + if (err) { + VERIFY(arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf) == 1); + kmem_free(os, sizeof (objset_t)); + return (err); + } + } else if (ds == NULL) { + /* It's the meta-objset. 
*/ + os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + os->os_compress = ZIO_COMPRESS_LZJB; + os->os_copies = spa_max_replication(spa); + os->os_dedup_checksum = ZIO_CHECKSUM_OFF; + os->os_dedup_verify = 0; + os->os_logbias = 0; + os->os_sync = 0; + os->os_primary_cache = ZFS_CACHE_ALL; + os->os_secondary_cache = ZFS_CACHE_ALL; + } + + if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + os->os_zil_header = os->os_phys->os_zil_header; + os->os_zil = zil_alloc(os, &os->os_zil_header); + + for (i = 0; i < TXG_SIZE; i++) { + list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + list_create(&os->os_free_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + } + list_create(&os->os_dnodes, sizeof (dnode_t), + offsetof(dnode_t, dn_link)); + list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + + DMU_META_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, + &os->os_meta_dnode); + if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { + DMU_USERUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, + &os->os_userused_dnode); + DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, + &os->os_groupused_dnode); + } + + /* + * We should be the only thread trying to do this because we + * have ds_opening_lock + */ + if (ds) { + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); + } + + *osp = os; + return (0); +} + +int +dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) +{ + int err = 0; + + mutex_enter(&ds->ds_opening_lock); + *osp = ds->ds_objset; + if (*osp == NULL) { + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), + ds, dsl_dataset_get_blkptr(ds), osp); + } + mutex_exit(&ds->ds_opening_lock); + return (err); +} + +/* called from zpl */ +int +dmu_objset_hold(const char *name, void *tag, objset_t **osp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(name, tag, &ds); + if (err) + return (err); + + err = dmu_objset_from_ds(ds, osp); + if (err) + dsl_dataset_rele(ds, tag); + + return (err); +} + +/* called from zpl */ +int +dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_own(name, B_FALSE, tag, &ds); + if (err) + return (err); + + err = dmu_objset_from_ds(ds, osp); + if (err) { + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dmu_objset_disown(*osp, tag); + return (EINVAL); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dmu_objset_disown(*osp, tag); + return (EROFS); + } + return (err); +} + +void +dmu_objset_rele(objset_t *os, void *tag) +{ + dsl_dataset_rele(os->os_dsl_dataset, tag); +} + +void +dmu_objset_disown(objset_t *os, void *tag) +{ + dsl_dataset_disown(os->os_dsl_dataset, tag); +} + +int +dmu_objset_evict_dbufs(objset_t *os) +{ + dnode_t *dn; + + mutex_enter(&os->os_lock); + + /* process the mdn last, since the other dnodes have holds on it */ + list_remove(&os->os_dnodes, DMU_META_DNODE(os)); + list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); + + /* + * Find the first dnode with holds. 
We have to do this dance + * because dnode_add_ref() only works if you already have a + * hold. If there are no holds then it has no dbufs so OK to + * skip. + */ + for (dn = list_head(&os->os_dnodes); + dn && !dnode_add_ref(dn, FTAG); + dn = list_next(&os->os_dnodes, dn)) + continue; + + while (dn) { + dnode_t *next_dn = dn; + + do { + next_dn = list_next(&os->os_dnodes, next_dn); + } while (next_dn && !dnode_add_ref(next_dn, FTAG)); + + mutex_exit(&os->os_lock); + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + mutex_enter(&os->os_lock); + dn = next_dn; + } + dn = list_head(&os->os_dnodes); + mutex_exit(&os->os_lock); + return (dn != DMU_META_DNODE(os)); +} + +void +dmu_objset_evict(objset_t *os) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!dmu_objset_is_dirty(os, t)); + + if (ds) { + if (!dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_unregister(ds, "checksum", + checksum_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "compression", + compression_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "copies", + copies_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "dedup", + dedup_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "logbias", + logbias_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "sync", + sync_changed_cb, os)); + } + VERIFY(0 == dsl_prop_unregister(ds, "primarycache", + primary_cache_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", + secondary_cache_changed_cb, os)); + } + + if (os->os_sa) + sa_tear_down(os); + + /* + * We should need only a single pass over the dnode list, since + * nothing can be added to the list at this point. + */ + (void) dmu_objset_evict_dbufs(os); + + dnode_special_close(&os->os_meta_dnode); + if (DMU_USERUSED_DNODE(os)) { + dnode_special_close(&os->os_userused_dnode); + dnode_special_close(&os->os_groupused_dnode); + } + zil_free(os->os_zil); + + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + + /* + * This is a barrier to prevent the objset from going away in + * dnode_move() until we can safely ensure that the objset is still in + * use. We consider the objset valid before the barrier and invalid + * after the barrier. + */ + rw_enter(&os_lock, RW_READER); + rw_exit(&os_lock); + + mutex_destroy(&os->os_lock); + mutex_destroy(&os->os_obj_lock); + mutex_destroy(&os->os_user_ptr_lock); + kmem_free(os, sizeof (objset_t)); +} + +timestruc_t +dmu_objset_snap_cmtime(objset_t *os) +{ + return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); +} + +/* called from dsl for meta-objset */ +objset_t * +dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, dmu_tx_t *tx) +{ + objset_t *os; + dnode_t *mdn; + + ASSERT(dmu_tx_is_syncing(tx)); + if (ds != NULL) + VERIFY(0 == dmu_objset_from_ds(ds, &os)); + else + VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); + + mdn = DMU_META_DNODE(os); + + dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, + DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); + + /* + * We don't want to have to increase the meta-dnode's nlevels + * later, because then we could do it in quescing context while + * we are also accessing it in open context. + * + * This precaution is not necessary for the MOS (ds == NULL), + * because the MOS is only updated in syncing context. 
+ * This is most fortunate: the MOS is the only objset that + * needs to be synced multiple times as spa_sync() iterates + * to convergence, so minimizing its dn_nlevels matters. + */ + if (ds != NULL) { + int levels = 1; + + /* + * Determine the number of levels necessary for the meta-dnode + * to contain DN_MAX_OBJECT dnodes. + */ + while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT * sizeof (dnode_phys_t)) + levels++; + + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = + mdn->dn_nlevels = levels; + } + + ASSERT(type != DMU_OST_NONE); + ASSERT(type != DMU_OST_ANY); + ASSERT(type < DMU_OST_NUMTYPES); + os->os_phys->os_type = type; + if (dmu_objset_userused_enabled(os)) { + os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; + } + + dsl_dataset_dirty(ds, tx); + + return (os); +} + +struct oscarg { + void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); + void *userarg; + dsl_dataset_t *clone_origin; + const char *lastname; + dmu_objset_type_t type; + uint64_t flags; + cred_t *cr; +}; + +/*ARGSUSED*/ +static int +dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct oscarg *oa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + int err; + uint64_t ddobj; + + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + oa->lastname, sizeof (uint64_t), 1, &ddobj); + if (err != ENOENT) + return (err ? err : EEXIST); + + if (oa->clone_origin != NULL) { + /* You can't clone across pools. */ + if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + + /* You can only clone snapshots, not the head datasets. */ + if (!dsl_dataset_is_snapshot(oa->clone_origin)) + return (EINVAL); + } + + return (0); +} + +static void +dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + spa_t *spa = dd->dd_pool->dp_spa; + struct oscarg *oa = arg2; + uint64_t obj; + + ASSERT(dmu_tx_is_syncing(tx)); + + obj = dsl_dataset_create_sync(dd, oa->lastname, + oa->clone_origin, oa->flags, oa->cr, tx); + + if (oa->clone_origin == NULL) { + dsl_pool_t *dp = dd->dd_pool; + dsl_dataset_t *ds; + blkptr_t *bp; + objset_t *os; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + bp = dsl_dataset_get_blkptr(ds); + ASSERT(BP_IS_HOLE(bp)); + + os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); + + if (oa->userfunc) + oa->userfunc(os, oa->userarg, oa->cr, tx); + dsl_dataset_rele(ds, FTAG); + } + + spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); +} + +int +dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) +{ + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; + + ASSERT(strchr(name, '@') == NULL); + err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); + } + + oa.userfunc = func; + oa.userarg = arg; + oa.lastname = tail; + oa.type = type; + oa.flags = flags; + oa.cr = CRED(); + + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); +} + +int +dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) +{ + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; + + ASSERT(strchr(name, '@') == NULL); + 
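+ /*
+  * Creating and cloning share dmu_objset_create_check()/_sync() above;
+  * the difference is that a plain create runs the caller's callback
+  * against the brand-new, empty objset in syncing context, while a
+  * clone does not. A minimal sketch of such a callback (hypothetical
+  * consumer; the ZPL and zvol supply their own):
+  *
+  *     static void
+  *     myfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+  *     {
+  *             (void) zap_create(os, DMU_OT_UNLINKED_SET,
+  *                 DMU_OT_NONE, 0, tx);
+  *     }
+  *
+  *     err = dmu_objset_create("tank/fs", DMU_OST_ZFS, 0,
+  *         myfs_create_cb, NULL);
+  */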
err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); + } + + oa.lastname = tail; + oa.clone_origin = clone_origin; + oa.flags = flags; + oa.cr = CRED(); + + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); +} + +int +dmu_objset_destroy(const char *name, boolean_t defer) +{ + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_destroy(ds, FTAG, defer); + /* dsl_dataset_destroy() closes the ds. */ + } + + return (error); +} + +struct snaparg { + dsl_sync_task_group_t *dstg; + char *snapname; + char *htag; + char failed[MAXPATHLEN]; + boolean_t recursive; + boolean_t needsuspend; + boolean_t temporary; + nvlist_t *props; + struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ + dsl_dataset_t *newds; +}; + +static int +snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct snaparg *sn = arg2; + int error; + + /* The props have already been checked by zfs_check_userprops(). */ + + error = dsl_dataset_snapshot_check(os->os_dsl_dataset, + sn->snapname, tx); + if (error) + return (error); + + if (sn->temporary) { + /* + * Ideally we would just call + * dsl_dataset_user_hold_check() and + * dsl_dataset_destroy_check() here. However the + * dataset we want to hold and destroy is the snapshot + * that we just confirmed we can create, but it won't + * exist until after these checks are run. Do any + * checks we can here and if more checks are added to + * those routines in the future, similar checks may be + * necessary here. + */ + if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + /* + * Not checking number of tags because the tag will be + * unique, as it will be the only tag. + */ + if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (E2BIG); + + sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + sn->ha->temphold = B_TRUE; + sn->ha->htag = sn->htag; + } + return (error); +} + +static void +snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; + struct snaparg *sn = arg2; + + dsl_dataset_snapshot_sync(ds, sn->snapname, tx); + + if (sn->props) { + dsl_props_arg_t pa; + pa.pa_props = sn->props; + pa.pa_source = ZPROP_SRC_LOCAL; + dsl_props_set_sync(ds->ds_prev, &pa, tx); + } + + if (sn->temporary) { + struct dsl_ds_destroyarg da; + + dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); + kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); + sn->ha = NULL; + sn->newds = ds->ds_prev; + + da.ds = ds->ds_prev; + da.defer = B_TRUE; + dsl_dataset_destroy_sync(&da, FTAG, tx); + } +} + +static int +dmu_objset_snapshot_one(const char *name, void *arg) +{ + struct snaparg *sn = arg; + objset_t *os; + int err; + char *cp; + + /* + * If the objset starts with a '%', then ignore it unless it was + * explicitly named (ie, not recursive). These hidden datasets + * are always inconsistent, and by not opening them here, we can + * avoid a race with dsl_dir_destroy_check(). + */ + cp = strrchr(name, '/'); + if (cp && cp[1] == '%' && sn->recursive) + return (0); + + (void) strcpy(sn->failed, name); + + /* + * Check permissions if we are doing a recursive snapshot. 
The + * permission checks for the starting dataset have already been + * performed in zfs_secpolicy_snapshot() + */ + if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) + return (err); + + err = dmu_objset_hold(name, sn, &os); + if (err != 0) + return (err); + + /* + * If the objset is in an inconsistent state (eg, in the process + * of being destroyed), don't snapshot it. As with %hidden + * datasets, we return EBUSY if this name was explicitly + * requested (ie, not recursive), and otherwise ignore it. + */ + if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { + dmu_objset_rele(os, sn); + return (sn->recursive ? 0 : EBUSY); + } + + if (sn->needsuspend) { + err = zil_suspend(dmu_objset_zil(os)); + if (err) { + dmu_objset_rele(os, sn); + return (err); + } + } + dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, + os, sn, 3); + + return (0); +} + +int +dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) +{ + dsl_sync_task_t *dst; + struct snaparg sn; + spa_t *spa; + minor_t minor; + int err; + + (void) strcpy(sn.failed, fsname); + + err = spa_open(fsname, &spa, FTAG); + if (err) + return (err); + + if (temporary) { + if (cleanup_fd < 0) { + spa_close(spa, FTAG); + return (EINVAL); + } + if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { + spa_close(spa, FTAG); + return (err); + } + } + + sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + sn.snapname = snapname; + sn.htag = tag; + sn.props = props; + sn.recursive = recursive; + sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + sn.temporary = temporary; + sn.ha = NULL; + sn.newds = NULL; + + if (recursive) { + err = dmu_objset_find(fsname, + dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); + } else { + err = dmu_objset_snapshot_one(fsname, &sn); + } + + if (err == 0) + err = dsl_sync_task_group_wait(sn.dstg); + + for (dst = list_head(&sn.dstg->dstg_tasks); dst; + dst = list_next(&sn.dstg->dstg_tasks, dst)) { + objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; + if (dst->dst_err) { + dsl_dataset_name(ds, sn.failed); + } else if (temporary) { + dsl_register_onexit_hold_cleanup(sn.newds, tag, minor); + } + if (sn.needsuspend) + zil_resume(dmu_objset_zil(os)); + dmu_objset_rele(os, &sn); + } + + if (err) + (void) strcpy(fsname, sn.failed); + if (temporary) + zfs_onexit_fd_rele(cleanup_fd); + dsl_sync_task_group_destroy(sn.dstg); + spa_close(spa, FTAG); + return (err); +} + +static void +dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) +{ + dnode_t *dn; + + while (dn = list_head(list)) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(dn->dn_dbuf->db_data_pending); + /* + * Initialize dn_zio outside dnode_sync() because the + * meta-dnode needs to set it ouside dnode_sync(). 
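+ *
+ * Rough shape of the i/o tree this builds for one objset sync (an
+ * illustrative sketch, not literal structure names):
+ *
+ *     arc_write() of the objset root block    <- issued in dmu_objset_sync()
+ *         writes of the meta-dnode blocks     <- the dr_zio borrowed here
+ *             writes of each object's own dirty dbufs, parented on dn_zio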
+ */ + dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; + ASSERT(dn->dn_zio); + + ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); + list_remove(list, dn); + + if (newlist) { + (void) dnode_add_ref(dn, newlist); + list_insert_tail(newlist, dn); + } + + dnode_sync(dn, tx); + } +} + +/* ARGSUSED */ +static void +dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + objset_t *os = arg; + dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + + ASSERT(bp == os->os_rootbp); + ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); + ASSERT(BP_GET_LEVEL(bp) == 0); + + /* + * Update rootbp fill count: it should be the number of objects + * allocated in the object set (not counting the "special" + * objects that are stored in the objset_phys_t -- the meta + * dnode and user/group accounting objects). + */ + bp->blk_fill = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + bp->blk_fill += dnp->dn_blkptr[i].blk_fill; +} + +/* ARGSUSED */ +static void +dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } +} + +/* called from dsl */ +void +dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) +{ + int txgoff; + zbookmark_t zb; + zio_prop_t zp; + zio_t *zio; + list_t *list; + list_t *newlist = NULL; + dbuf_dirty_record_t *dr; + + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + + ASSERT(dmu_tx_is_syncing(tx)); + /* XXX the write_done callback should really give us the tx... */ + os->os_synctx = tx; + + if (os->os_dsl_dataset == NULL) { + /* + * This is the MOS. If we have upgraded, + * spa_max_replication() could change, so reset + * os_copies here. + */ + os->os_copies = spa_max_replication(os->os_spa); + } + + /* + * Create the root block IO + */ + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf, + os->os_rootbp, os->os_spa, &zb)); + + dmu_write_policy(os, NULL, 0, 0, &zp); + + zio = arc_write(pio, os->os_spa, tx->tx_txg, + os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, + dmu_objset_write_ready, dmu_objset_write_done, os, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + + /* + * Sync special dnodes - the parent IO for the sync is the root block + */ + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); + + os->os_phys->os_flags = os->os_flags; + + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); + } + + txgoff = tx->tx_txg & TXG_MASK; + + if (dmu_objset_userused_enabled(os)) { + newlist = &os->os_synced_dnodes; + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. 
+ */ + list_create(newlist, sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff])); + } + + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); + + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while (dr = list_head(list)) { + ASSERT(dr->dr_dbuf->db_level == 0); + list_remove(list, dr); + if (dr->dr_zio) + zio_nowait(dr->dr_zio); + } + /* + * Free intent log blocks up to this tx. + */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(zio); +} + +boolean_t +dmu_objset_is_dirty(objset_t *os, uint64_t txg) +{ + return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || + !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); +} + +boolean_t +dmu_objset_is_dirty_anywhere(objset_t *os) +{ + for (int t = 0; t < TXG_SIZE; t++) + if (dmu_objset_is_dirty(os, t)) + return (B_TRUE); + return (B_FALSE); +} + +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +{ + used_cbs[ost] = cb; +} + +boolean_t +dmu_objset_userused_enabled(objset_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + used_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); +} + +static void +do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) +{ + if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { + int64_t delta = DNODE_SIZE + used; + if (subtract) + delta = -delta; + VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, + user, delta, tx)); + VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, + group, delta, tx)); + } +} + +void +dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +{ + dnode_t *dn; + list_t *list = &os->os_synced_dnodes; + + ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + + while (dn = list_head(list)) { + int flags; + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + /* Allocate the user/groupused objects if necessary. */ + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { + VERIFY(0 == zap_create_claim(os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY(0 == zap_create_claim(os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + /* + * We intentionally modify the zap object even if the + * net delta is zero. Otherwise + * the block of the zap obj could be shared between + * datasets but need to be different between them after + * a bprewrite. 
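+ *
+ * Worked example (illustrative numbers): after a bprewrite of a file
+ * charging 131072 bytes to uid 100, the old and new accounting are
+ * identical, yet do_userquota_update() still performs
+ *
+ *     zap_increment_int(os, DMU_USERUSED_OBJECT, 100,
+ *         -(131072 + DNODE_SIZE), tx);
+ *     zap_increment_int(os, DMU_USERUSED_OBJECT, 100,
+ *         131072 + DNODE_SIZE, tx);
+ *
+ * (and the matching DMU_GROUPUSED_OBJECT updates), so the zap block is
+ * rewritten even though the net change is zero.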
+ */ + + flags = dn->dn_id_flags; + ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { + do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); + } + if (flags & DN_ID_NEW_EXIST) { + do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), + dn->dn_phys->dn_flags, dn->dn_newuid, + dn->dn_newgid, B_FALSE, tx); + } + + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + dn->dn_olduid = dn->dn_newuid; + dn->dn_oldgid = dn->dn_newgid; + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (dn->dn_bonuslen == 0) + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + else + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + mutex_exit(&dn->dn_mtx); + + list_remove(list, dn); + dnode_rele(dn, list); + } +} + +/* + * Returns a pointer to data to find uid/gid from + * + * If a dirty record for transaction group that is syncing can't + * be found then NULL is returned. In the NULL case it is assumed + * the uid/gid aren't changing. + */ +static void * +dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr, **drp; + void *data; + + if (db->db_dirtycnt == 0) + return (db->db.db_data); /* Nothing is changing */ + + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg == tx->tx_txg) + break; + + if (dr == NULL) { + data = NULL; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + + return (data); +} + +void +dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + void *data = NULL; + dmu_buf_impl_t *db = NULL; + uint64_t *user, *group; + int flags = dn->dn_id_flags; + int error; + boolean_t have_spill = B_FALSE; + + if (!dmu_objset_userused_enabled(dn->dn_objset)) + return; + + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| + DN_ID_CHKED_SPILL))) + return; + + if (before && dn->dn_bonuslen != 0) + data = DN_BONUS(dn->dn_phys); + else if (!before && dn->dn_bonuslen != 0) { + if (dn->dn_bonus) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + data = dmu_objset_userquota_find_data(db, tx); + } else { + data = DN_BONUS(dn->dn_phys); + } + } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { + int rf = 0; + + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + error = dmu_spill_hold_by_dnode(dn, + rf | DB_RF_MUST_SUCCEED, + FTAG, (dmu_buf_t **)&db); + ASSERT(error == 0); + mutex_enter(&db->db_mtx); + data = (before) ? db->db.db_data : + dmu_objset_userquota_find_data(db, tx); + have_spill = B_TRUE; + } else { + mutex_enter(&dn->dn_mtx); + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + mutex_exit(&dn->dn_mtx); + return; + } + + if (before) { + ASSERT(data); + user = &dn->dn_olduid; + group = &dn->dn_oldgid; + } else if (data) { + user = &dn->dn_newuid; + group = &dn->dn_newgid; + } + + /* + * Must always call the callback in case the object + * type has changed and that type isn't an object type to track + */ + error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, + user, group); + + /* + * Preserve existing uid/gid when the callback can't determine + * what the new uid/gid are and the callback returned EEXIST. + * The EEXIST error tells us to just use the existing uid/gid. 
+ * If we don't know what the old values are then just assign + * them to 0, since that is a new file being created. + */ + if (!before && data == NULL && error == EEXIST) { + if (flags & DN_ID_OLD_EXIST) { + dn->dn_newuid = dn->dn_olduid; + dn->dn_newgid = dn->dn_oldgid; + } else { + dn->dn_newuid = 0; + dn->dn_newgid = 0; + } + error = 0; + } + + if (db) + mutex_exit(&db->db_mtx); + + mutex_enter(&dn->dn_mtx); + if (error == 0 && before) + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (error == 0 && !before) + dn->dn_id_flags |= DN_ID_NEW_EXIST; + + if (have_spill) { + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + } else { + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + mutex_exit(&dn->dn_mtx); + if (have_spill) + dmu_buf_rele((dmu_buf_t *)db, FTAG); +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (!dmu_objset_userused_enabled(os)) + return (ENOTSUP); + if (dmu_objset_is_snapshot(os)) + return (EINVAL); + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. + */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + dmu_buf_t *db; + int objerr; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr) + continue; + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr) { + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +void +dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, + usedobjsp, availobjsp); +} + +uint64_t +dmu_objset_fsid_guid(objset_t *os) +{ + return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); +} + +void +dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) +{ + stat->dds_type = os->os_phys->os_type; + if (os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os_dsl_dataset, stat); +} + +void +dmu_objset_stats(objset_t *os, nvlist_t *nv) +{ + ASSERT(os->os_dsl_dataset || + os->os_phys->os_type == DMU_OST_META); + + if (os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os_dsl_dataset, nv); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, + os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); +} + +int +dmu_objset_is_snapshot(objset_t *os) +{ + if (os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); + else + return (B_FALSE); +} + +int +dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, + boolean_t *conflict) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + uint64_t ignored; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, + real, maxlen, conflict)); +} + +int +dmu_snapshot_list_next(objset_t *os, int 
namelen, char *name, + uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + zap_cursor_t cursor; + zap_attribute_t attr; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + zap_cursor_init_serialized(&cursor, + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (ENOENT); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (ENAMETOOLONG); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + if (case_conflict) + *case_conflict = attr.za_normalization_conflict; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +int +dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp) +{ + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + zap_cursor_t cursor; + zap_attribute_t attr; + + /* there is no next dir on a snapshot! */ + if (os->os_dsl_dataset->ds_object != + dd->dd_phys->dd_head_dataset_obj) + return (ENOENT); + + zap_cursor_init_serialized(&cursor, + dd->dd_pool->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (ENOENT); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (ENAMETOOLONG); + } + + (void) strcpy(name, attr.za_name); + if (idp) + *idp = attr.za_first_integer; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +struct findarg { + int (*func)(const char *, void *); + void *arg; +}; + +/* ARGSUSED */ +static int +findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct findarg *fa = arg; + return (fa->func(dsname, fa->arg)); +} + +/* + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * Perhaps change all callers to use dmu_objset_find_spa()? + */ +int +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) +{ + struct findarg fa; + fa.func = func; + fa.arg = arg; + return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); +} + +/* + * Find all objsets under name, call func on each + */ +int +dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +{ + dsl_dir_t *dd; + dsl_pool_t *dp; + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + char *child; + uint64_t thisobj; + int err; + + if (name == NULL) + name = spa_name(spa); + err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); + if (err) + return (err); + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_close(dd, FTAG); + return (0); + } + + thisobj = dd->dd_phys->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + dp = dd->dd_pool; + + /* + * Iterate over all children. 
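+ * (For a concrete caller of this machinery: the recursive snapshot code
+ * earlier in this file invokes dmu_objset_find(fsname,
+ * dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN), which lands here and
+ * walks every descendant filesystem before the sync task group is run.)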
+ */ + if (flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr->za_integer_length == sizeof (uint64_t)); + ASSERT(attr->za_num_integers == 1); + + child = kmem_asprintf("%s/%s", name, attr->za_name); + err = dmu_objset_find_spa(spa, child, func, arg, flags); + strfree(child); + if (err) + break; + } + zap_cursor_fini(&zc); + + if (err) { + dsl_dir_close(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + return (err); + } + } + + /* + * Iterate over all snapshots. + */ + if (flags & DS_FIND_SNAPSHOTS) { + if (!dsl_pool_sync_context(dp)) + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (!dsl_pool_sync_context(dp)) + rw_exit(&dp->dp_config_rwlock); + + if (err == 0) { + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr->za_integer_length == + sizeof (uint64_t)); + ASSERT(attr->za_num_integers == 1); + + child = kmem_asprintf("%s@%s", + name, attr->za_name); + err = func(spa, attr->za_first_integer, + child, arg); + strfree(child); + if (err) + break; + } + zap_cursor_fini(&zc); + } + } + + dsl_dir_close(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + + if (err) + return (err); + + /* + * Apply to self if appropriate. + */ + err = func(spa, thisobj, name, arg); + return (err); +} + +/* ARGSUSED */ +int +dmu_objset_prefetch(const char *name, void *arg) +{ + dsl_dataset_t *ds; + + if (dsl_dataset_hold(name, FTAG, &ds)) + return (0); + + if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { + mutex_enter(&ds->ds_opening_lock); + if (ds->ds_objset == NULL) { + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds), + &ds->ds_phys->ds_bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + mutex_exit(&ds->ds_opening_lock); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + return (os->os_user_ptr); +} diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c new file mode 100644 index 000000000000..e47d533a44f4 --- /dev/null +++ b/uts/common/fs/zfs/dmu_send.c @@ -0,0 +1,1606 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> +#include <zfs_fletcher.h> +#include <sys/avl.h> +#include <sys/ddt.h> +#include <sys/zfs_onexit.h> + +static char *dmu_recv_tag = "dmu_recv_tag"; + +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free() and + * dump_freeobjects() can be aggregated into a single DRR_FREE or + * DRR_FREEOBJECTS replay record. + */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS +} pendop_t; + +struct backuparg { + dmu_replay_record_t *drr; + vnode_t *vp; + offset_t *off; + objset_t *os; + zio_cksum_t zc; + uint64_t toguid; + int err; + pendop_t pending_op; +}; + +static int +dump_bytes(struct backuparg *ba, void *buf, int len) +{ + ssize_t resid; /* have to get resid to get detailed errno */ + ASSERT3U(len % 8, ==, 0); + + fletcher_4_incremental_native(buf, len, &ba->zc); + ba->err = vn_rdwr(UIO_WRITE, ba->vp, + (caddr_t)buf, len, + 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); + *ba->off += len; + return (ba->err); +} + +static int +dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, + uint64_t length) +{ + struct drr_free *drrf = &(ba->drr->drr_u.drr_free); + + /* + * If there is a pending op, but it's not PENDING_FREE, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_FREE records can only be aggregated with + * other DRR_FREE records. DRR_FREEOBJECTS records can only be + * aggregated with other DRR_FREEOBJECTS records. + */ + if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + if (ba->pending_op == PENDING_FREE) { + /* + * There should never be a PENDING_FREE if length is -1 + * (because dump_dnode is the only place where this + * function is called with a -1, and only after flushing + * any pending record). + */ + ASSERT(length != -1ULL); + /* + * Check to see whether this free block can be aggregated + * with pending one. + */ + if (drrf->drr_object == object && drrf->drr_offset + + drrf->drr_length == offset) { + drrf->drr_length += length; + return (0); + } else { + /* not a continuation. 
Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + /* create a FREE record and make it pending */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREE; + drrf->drr_object = object; + drrf->drr_offset = offset; + drrf->drr_length = length; + drrf->drr_toguid = ba->toguid; + if (length == -1ULL) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + } else { + ba->pending_op = PENDING_FREE; + } + + return (0); +} + +static int +dump_data(struct backuparg *ba, dmu_object_type_t type, + uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) +{ + struct drr_write *drrw = &(ba->drr->drr_u.drr_write); + + + /* + * If there is any kind of pending aggregation (currently either + * a grouping of free objects or free blocks), push it out to + * the stream, since aggregation can't be done across operations + * of different types. + */ + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + /* write a DATA record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_WRITE; + drrw->drr_object = object; + drrw->drr_type = type; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = ba->toguid; + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + if (dump_bytes(ba, data, blksz) != 0) + return (EINTR); + return (0); +} + +static int +dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data) +{ + struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill); + + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + /* write a SPILL record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_SPILL; + drrs->drr_object = object; + drrs->drr_length = blksz; + drrs->drr_toguid = ba->toguid; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + if (dump_bytes(ba, data, blksz)) + return (EINTR); + return (0); +} + +static int +dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) +{ + struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); + + /* + * If there is a pending op, but it's not PENDING_FREEOBJECTS, + * push it out, since free block aggregation can only be done for + * blocks of the same type (i.e., DRR_FREE records can only be + * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records + * can only be aggregated with other DRR_FREEOBJECTS records. 
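 *
 * [Illustrative note, not part of the original change: a worked
 * example of the aggregation the pending-record machinery performs.
 * If the pending DRR_FREEOBJECTS record covers objects 10..19
 * (drr_firstobj == 10, drr_numobjs == 10) and the next call frees
 * objects 20..24, the ranges are adjacent (10 + 10 == 20), so
 * drr_numobjs simply grows to 15 and no record is emitted yet.  The
 * analogous check in dump_free() merges a free of (object, offset,
 * length) into the pending DRR_FREE when drr_offset + drr_length ==
 * offset.  Any non-adjacent or differently-typed request first
 * flushes the pending record to the stream.]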
+ */ + if (ba->pending_op != PENDING_NONE && + ba->pending_op != PENDING_FREEOBJECTS) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + if (ba->pending_op == PENDING_FREEOBJECTS) { + /* + * See whether this free object array can be aggregated + * with pending one + */ + if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { + drrfo->drr_numobjs += numobjs; + return (0); + } else { + /* can't be aggregated. Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + + /* write a FREEOBJECTS record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREEOBJECTS; + drrfo->drr_firstobj = firstobj; + drrfo->drr_numobjs = numobjs; + drrfo->drr_toguid = ba->toguid; + + ba->pending_op = PENDING_FREEOBJECTS; + + return (0); +} + +static int +dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) +{ + struct drr_object *drro = &(ba->drr->drr_u.drr_object); + + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) + return (dump_freeobjects(ba, object, 1)); + + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + /* write an OBJECT record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_OBJECT; + drro->drr_object = object; + drro->drr_type = dnp->dn_type; + drro->drr_bonustype = dnp->dn_bonustype; + drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_checksumtype = dnp->dn_checksum; + drro->drr_compress = dnp->dn_compress; + drro->drr_toguid = ba->toguid; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) + return (EINTR); + + /* free anything past the end of the file */ + if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + return (EINTR); + if (ba->err) + return (EINTR); + return (0); +} + +#define BP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +/* ARGSUSED */ +static int +backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct backuparg *ba = arg; + dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + if (zb->zb_object != DMU_META_DNODE_OBJECT && + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + return (0); + } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t span = BP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); + } else if (bp == NULL) { + uint64_t span = BP_SPAN(dnp, zb->zb_level); + err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); + } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { + return (0); + } else if (type == DMU_OT_DNODE) { + dnode_phys_t *blk; + int i; + int blksz = BP_GET_LSIZE(bp); + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + + if (dsl_read(NULL, spa, bp, pbuf, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + blk = abuf->b_data; + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = dump_dnode(ba, dnobj, blk+i); + if (err) + break; + } + (void) arc_buf_remove_ref(abuf, &abuf); + } else if (type == DMU_OT_SA) { + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + int blksz = BP_GET_LSIZE(bp); + + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); + } else { /* it's a level-0 block of a regular object */ + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + int blksz = BP_GET_LSIZE(bp); + + if (dsl_read(NULL, spa, bp, pbuf, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, + blksz, bp, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); + } + + ASSERT(err == 0 || err == EINTR); + return (err); +} + +int +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + vnode_t *vp, offset_t *off) +{ + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; + dmu_replay_record_t *drr; + struct backuparg ba; + int err; + uint64_t fromtxg = 0; + + /* tosnap must be a snapshot */ + if (ds->ds_phys->ds_next_snap_obj == 0) + return (EINVAL); + + /* fromsnap must be an earlier snapshot from the same fs as tosnap */ + if (fromds && (ds->ds_dir != fromds->ds_dir || + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) + return (EXDEV); + + if (fromorigin) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (fromsnap) + return (EINVAL); + + if (dsl_dir_is_clone(ds->ds_dir)) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, + DMU_SUBSTREAM); + +#ifdef _KERNEL + if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { + uint64_t version; + if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) + return (EINVAL); + if (version == ZPL_VERSION_SA) { + DMU_SET_FEATUREFLAGS( + drr->drr_u.drr_begin.drr_versioninfo, + DMU_BACKUP_FEATURE_SA_SPILL); + } + } +#endif + + drr->drr_u.drr_begin.drr_creation_time = + ds->ds_phys->ds_creation_time; + drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; + if (fromorigin) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; + drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + + if (fromds) + drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; + dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + + if (fromds) + fromtxg = fromds->ds_phys->ds_creation_txg; + if (fromorigin) + dsl_dataset_rele(fromds, FTAG); + + ba.drr = drr; + ba.vp = vp; + ba.os = tosnap; + ba.off = off; + ba.toguid = ds->ds_phys->ds_guid; + ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); + ba.pending_op = PENDING_NONE; + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, + backup_cb, &ba); + + if (ba.pending_op != PENDING_NONE) + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) + err = EINTR; + + if (err) { + if (err == EINTR && ba.err) + err = ba.err; + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (err); + } + + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = ba.zc; + drr->drr_u.drr_end.drr_toguid = ba.toguid; + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + kmem_free(drr, sizeof (dmu_replay_record_t)); + + return (0); +} + +struct recvbeginsyncarg { + const char *tofs; + const char *tosnap; + dsl_dataset_t *origin; + uint64_t fromguid; + dmu_objset_type_t type; + void *tag; + boolean_t force; + uint64_t dsflags; + char clonelastname[MAXNAMELEN]; + dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ + cred_t *cr; +}; + +/* ARGSUSED */ +static int +recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t val; + int err; + + err = 
zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); + + if (err != ENOENT) + return (err ? err : EEXIST); + + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + if (!dsl_dataset_is_snapshot(rbsa->origin)) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + + return (0); +} + +static void +recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + /* Create and open new dataset. */ + dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, + rbsa->origin, flags, rbsa->cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, + B_TRUE, dmu_recv_tag, &rbsa->ds)); + + if (rbsa->origin == NULL) { + (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, + rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); + } + + spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, + dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); +} + +/* ARGSUSED */ +static int +recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + uint64_t val; + + /* must not have any changes since most recent snapshot */ + if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + return (ETXTBSY); + + /* new snapshot name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + if (rbsa->fromguid) { + /* if incremental, most recent snapshot must match fromguid */ + if (ds->ds_prev == NULL) + return (ENODEV); + + /* + * most recent snapshot must match fromguid, or there are no + * changes since the fromguid one + */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { + uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; + uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; + while (obj != 0) { + dsl_dataset_t *snap; + err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + obj, FTAG, &snap); + if (err) + return (ENODEV); + if (snap->ds_phys->ds_creation_txg < birth) { + dsl_dataset_rele(snap, FTAG); + return (ENODEV); + } + if (snap->ds_phys->ds_guid == rbsa->fromguid) { + dsl_dataset_rele(snap, FTAG); + break; /* it's ok */ + } + obj = snap->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (ENODEV); + } + } else { + /* if full, most recent snapshot must be $ORIGIN */ + if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) + return (ENODEV); + } + + /* temporary clone name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, + rbsa->clonelastname, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + return (0); +} + +/* ARGSUSED */ +static void +recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ohds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_pool_t *dp = ohds->ds_dir->dd_pool; + dsl_dataset_t *cds; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + /* create and open the temporary clone */ + dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, + ohds->ds_prev, flags, rbsa->cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, 
dmu_recv_tag, &cds)); + + /* + * If we actually created a non-clone, we need to create the + * objset in our new dataset. + */ + if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { + (void) dmu_objset_create_impl(dp->dp_spa, + cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); + } + + rbsa->ds = cds; + + spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, "dataset = %lld", dsobj); +} + +static boolean_t +dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) +{ + int featureflags; + + featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + + /* Verify pool version supports SA if SA_SPILL feature set */ + return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); +} + +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. + */ +int +dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, + boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) +{ + int err = 0; + boolean_t byteswap; + struct recvbeginsyncarg rbsa = { 0 }; + uint64_t versioninfo; + int flags; + dsl_dataset_t *ds; + + if (drrb->drr_magic == DMU_BACKUP_MAGIC) + byteswap = FALSE; + else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + byteswap = TRUE; + else + return (EINVAL); + + rbsa.tofs = tofs; + rbsa.tosnap = tosnap; + rbsa.origin = origin ? origin->os_dsl_dataset : NULL; + rbsa.fromguid = drrb->drr_fromguid; + rbsa.type = drrb->drr_type; + rbsa.tag = FTAG; + rbsa.dsflags = 0; + rbsa.cr = CRED(); + versioninfo = drrb->drr_versioninfo; + flags = drrb->drr_flags; + + if (byteswap) { + rbsa.type = BSWAP_32(rbsa.type); + rbsa.fromguid = BSWAP_64(rbsa.fromguid); + versioninfo = BSWAP_64(versioninfo); + flags = BSWAP_32(flags); + } + + if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || + rbsa.type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && origin == NULL)) + return (EINVAL); + + if (flags & DRR_FLAG_CI_DATA) + rbsa.dsflags = DS_FLAG_CI_DATASET; + + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drrb = drrb; + drc->drc_tosnap = tosnap; + drc->drc_top_ds = top_ds; + drc->drc_force = force; + + /* + * Process the begin in syncing context. + */ + + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err == 0) { + if (dmu_recv_verify_features(ds, drrb)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (ENOTSUP); + } + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EINVAL); + } + + /* must not have an incremental recv already in progress */ + if (!mutex_tryenter(&ds->ds_recvlock)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EBUSY); + } + + /* tmp clone name is: tofs/%tosnap" */ + (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), + "%%%s", tosnap); + rbsa.force = force; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_existing_check, recv_existing_sync, ds, &rbsa, 5); + if (err) { + mutex_exit(&ds->ds_recvlock); + dsl_dataset_rele(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = ds; + drc->drc_real_ds = rbsa.ds; + } else if (err == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char *cp; + + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. 
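 *
 * [Illustrative note, not part of the original change, using assumed
 * names: receiving a full or clone stream for "tank/home@today" when
 * "tank/home" does not yet exist goes through the recv_new path and
 * creates the dataset; receiving into an existing "tank/home" instead
 * builds a temporary clone named "tank/home/%today" (the tofs/%tosnap
 * form noted earlier) and applies the stream there, while the branch
 * this comment describes fails with ENOENT.]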
+ */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) + return (ENOENT); + + /* Open the parent of tofs */ + cp = strrchr(tofs, '/'); + *cp = '\0'; + err = dsl_dataset_hold(tofs, FTAG, &ds); + *cp = '/'; + if (err) + return (err); + + if (dmu_recv_verify_features(ds, drrb)) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; + drc->drc_newfs = B_TRUE; + } + + return (err); +} + +struct restorearg { + int err; + int byteswap; + vnode_t *vp; + char *buf; + uint64_t voff; + int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t cksum; + avl_tree_t *guid_to_ds_map; +}; + +typedef struct guid_map_entry { + uint64_t guid; + dsl_dataset_t *gme_ds; + avl_node_t avlnode; +} guid_map_entry_t; + +static int +guid_compare(const void *arg1, const void *arg2) +{ + const guid_map_entry_t *gmep1 = arg1; + const guid_map_entry_t *gmep2 = arg2; + + if (gmep1->guid < gmep2->guid) + return (-1); + else if (gmep1->guid > gmep2->guid) + return (1); + return (0); +} + +/* + * This function is a callback used by dmu_objset_find() (which + * enumerates the object sets) to build an avl tree that maps guids + * to datasets. The resulting table is used when processing DRR_WRITE_BYREF + * send stream records. These records, which are used in dedup'ed + * streams, do not contain data themselves, but refer to a copy + * of the data block that has already been written because it was + * earlier in the stream. That previous copy is identified by the + * guid of the dataset with the referenced data. + */ +int +find_ds_by_guid(const char *name, void *arg) +{ + avl_tree_t *guid_map = arg; + dsl_dataset_t *ds, *snapds; + guid_map_entry_t *gmep; + dsl_pool_t *dp; + int err; + uint64_t lastobj, firstobj; + + if (dsl_dataset_hold(name, FTAG, &ds) != 0) + return (0); + + dp = ds->ds_dir->dd_pool; + rw_enter(&dp->dp_config_rwlock, RW_READER); + firstobj = ds->ds_dir->dd_phys->dd_origin_obj; + lastobj = ds->ds_phys->ds_prev_snap_obj; + + while (lastobj != firstobj) { + err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds); + if (err) { + /* + * Skip this snapshot and move on. It's not + * clear why this would ever happen, but the + * remainder of the snapshot streadm can be + * processed. 
+ */ + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + return (0); + } + + gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); + gmep->guid = snapds->ds_phys->ds_guid; + gmep->gme_ds = snapds; + avl_add(guid_map, gmep); + lastobj = snapds->ds_phys->ds_prev_snap_obj; + } + + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + return (0); +} + +static void +free_guid_map_onexit(void *arg) +{ + avl_tree_t *ca = arg; + void *cookie = NULL; + guid_map_entry_t *gmep; + + while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { + dsl_dataset_rele(gmep->gme_ds, ca); + kmem_free(gmep, sizeof (guid_map_entry_t)); + } + avl_destroy(ca); + kmem_free(ca, sizeof (avl_tree_t)); +} + +static void * +restore_read(struct restorearg *ra, int len) +{ + void *rv; + int done = 0; + + /* some things will require 8-byte alignment, so everything must */ + ASSERT3U(len % 8, ==, 0); + + while (done < len) { + ssize_t resid; + + ra->err = vn_rdwr(UIO_READ, ra->vp, + (caddr_t)ra->buf + done, len - done, + ra->voff, UIO_SYSSPACE, FAPPEND, + RLIM64_INFINITY, CRED(), &resid); + + if (resid == len - done) + ra->err = EINVAL; + ra->voff += len - done - resid; + done = len - resid; + if (ra->err) + return (NULL); + } + + ASSERT3U(done, ==, len); + rv = ra->buf; + if (ra->byteswap) + fletcher_4_incremental_byteswap(rv, len, &ra->cksum); + else + fletcher_4_incremental_native(rv, len, &ra->cksum); + return (rv); +} + +static void +backup_byteswap(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_versioninfo); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + /* DO64(drr_object.drr_allocation_txg); */ + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + DO64(drr_object.drr_toguid); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_length); + DO64(drr_write.drr_toguid); + DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write.drr_key.ddk_prop); + break; + case DRR_WRITE_BYREF: + DO64(drr_write_byref.drr_object); + DO64(drr_write_byref.drr_offset); + DO64(drr_write_byref.drr_length); + DO64(drr_write_byref.drr_toguid); + DO64(drr_write_byref.drr_refguid); + DO64(drr_write_byref.drr_refobject); + DO64(drr_write_byref.drr_refoffset); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write_byref.drr_key.ddk_prop); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); + break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + 
DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); + break; + case DRR_END: + DO64(drr_end.drr_checksum.zc_word[0]); + DO64(drr_end.drr_checksum.zc_word[1]); + DO64(drr_end.drr_checksum.zc_word[2]); + DO64(drr_end.drr_checksum.zc_word[3]); + DO64(drr_end.drr_toguid); + break; + } +#undef DO64 +#undef DO32 +} + +static int +restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) +{ + int err; + dmu_tx_t *tx; + void *data = NULL; + + if (drro->drr_type == DMU_OT_NONE || + drro->drr_type >= DMU_OT_NUMTYPES || + drro->drr_bonustype >= DMU_OT_NUMTYPES || + drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > SPA_MAXBLOCKSIZE || + drro->drr_bonuslen > DN_MAX_BONUSLEN) { + return (EINVAL); + } + + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + + if (drro->drr_bonuslen) { + data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); + if (ra->err) + return (ra->err); + } + + if (err == ENOENT) { + /* currently free, want to be allocated */ + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_claim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + dmu_tx_commit(tx); + } else { + /* currently allocated, want to be allocated */ + err = dmu_object_reclaim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen); + } + if (err) { + return (EINVAL); + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, drro->drr_object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, + tx); + dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); + + if (data != NULL) { + dmu_buf_t *db; + + VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + bcopy(data, db->db_data, drro->drr_bonuslen); + if (ra->byteswap) { + dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + drro->drr_bonuslen); + } + dmu_buf_rele(db, FTAG); + } + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_freeobjects(struct restorearg *ra, objset_t *os, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (EINVAL); + + for (obj = drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs; + (void) dmu_object_next(os, &obj, FALSE, 0)) { + int err; + + if (dmu_object_info(os, obj, NULL) != 0) + continue; + + err = dmu_free_object(os, obj); + if (err) + return (err); + } + return (0); +} + +static int +restore_write(struct restorearg *ra, objset_t *os, + struct drr_write *drrw) +{ + dmu_tx_t *tx; + void *data; + int err; + + if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + drrw->drr_type >= DMU_OT_NUMTYPES) + return (EINVAL); + + data = restore_read(ra, drrw->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrw->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + 
dmu_tx_abort(tx); + return (err); + } + if (ra->byteswap) + dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + dmu_write(os, drrw->drr_object, + drrw->drr_offset, drrw->drr_length, data, tx); + dmu_tx_commit(tx); + return (0); +} + +/* + * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed + * streams to refer to a copy of the data that is already on the + * system because it came in earlier in the stream. This function + * finds the earlier copy of the data, and uses that copy instead of + * data from the stream to fulfill this write. + */ +static int +restore_write_byref(struct restorearg *ra, objset_t *os, + struct drr_write_byref *drrwbr) +{ + dmu_tx_t *tx; + int err; + guid_map_entry_t gmesrch; + guid_map_entry_t *gmep; + avl_index_t where; + objset_t *ref_os = NULL; + dmu_buf_t *dbp; + + if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) + return (EINVAL); + + /* + * If the GUID of the referenced dataset is different from the + * GUID of the target dataset, find the referenced dataset. + */ + if (drrwbr->drr_toguid != drrwbr->drr_refguid) { + gmesrch.guid = drrwbr->drr_refguid; + if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, + &where)) == NULL) { + return (EINVAL); + } + if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) + return (EINVAL); + } else { + ref_os = os; + } + + if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, + drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) + return (err); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + dmu_write(os, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + dmu_buf_rele(dbp, FTAG); + dmu_tx_commit(tx); + return (0); +} + +static int +restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) +{ + dmu_tx_t *tx; + void *data; + dmu_buf_t *db, *db_spill; + int err; + + if (drrs->drr_length < SPA_MINBLOCKSIZE || + drrs->drr_length > SPA_MAXBLOCKSIZE) + return (EINVAL); + + data = restore_read(ra, drrs->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrs->drr_object, NULL) != 0) + return (EINVAL); + + VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); + if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_spill(tx, db->db_object); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_tx_abort(tx); + return (err); + } + dmu_buf_will_dirty(db_spill, tx); + + if (db_spill->db_size < drrs->drr_length) + VERIFY(0 == dbuf_spill_set_blksz(db_spill, + drrs->drr_length, tx)); + bcopy(data, db_spill->db_data, drrs->drr_length); + + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_free(struct restorearg *ra, objset_t *os, + struct drr_free *drrf) +{ + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (EINVAL); + + if (dmu_object_info(os, drrf->drr_object, NULL) != 0) + return (EINVAL); + + err = dmu_free_long_range(os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + return (err); +} + +/* + * NB: callers *must* call dmu_recv_end() if this succeeds. 
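 *
 * [Illustrative sketch, not part of the original change: the intended
 * caller sequence, with vp, voff, cleanup_fd and action_handle
 * standing in for hypothetical caller state:
 *
 *	err = dmu_recv_begin(tofs, tosnap, top_ds, &drr_begin,
 *	    force, origin_os, &drc);
 *	if (err == 0) {
 *		err = dmu_recv_stream(&drc, vp, &voff,
 *		    cleanup_fd, &action_handle);
 *		if (err == 0)
 *			err = dmu_recv_end(&drc);
 *	}
 *
 * dmu_recv_begin() leaves holds on the datasets that only
 * dmu_recv_stream() (on failure) or dmu_recv_end() releases, which is
 * why both NB comments insist on making the later calls.]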
+ */ +int +dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, + int cleanup_fd, uint64_t *action_handlep) +{ + struct restorearg ra = { 0 }; + dmu_replay_record_t *drr; + objset_t *os; + zio_cksum_t pcksum; + int featureflags; + + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + ra.byteswap = TRUE; + + { + /* compute checksum of drr_begin record */ + dmu_replay_record_t *drr; + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); + } + + if (ra.byteswap) { + struct drr_begin *drrb = drc->drc_drrb; + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } + + ra.vp = vp; + ra.voff = *voffp; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + + /* these were verified in dmu_recv_begin */ + ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == + DMU_SUBSTREAM); + ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); + + /* + * Open the objset we are modifying. + */ + VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); + + ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + + featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); + + /* if this stream is dedup'ed, set up the avl tree for guid mapping */ + if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { + minor_t minor; + + if (cleanup_fd == -1) { + ra.err = EBADF; + goto out; + } + ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (ra.err) { + cleanup_fd = -1; + goto out; + } + + if (*action_handlep == 0) { + ra.guid_to_ds_map = + kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(ra.guid_to_ds_map, guid_compare, + sizeof (guid_map_entry_t), + offsetof(guid_map_entry_t, avlnode)); + (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, + (void *)ra.guid_to_ds_map, + DS_FIND_CHILDREN); + ra.err = zfs_onexit_add_cb(minor, + free_guid_map_onexit, ra.guid_to_ds_map, + action_handlep); + if (ra.err) + goto out; + } else { + ra.err = zfs_onexit_cb_data(minor, *action_handlep, + (void **)&ra.guid_to_ds_map); + if (ra.err) + goto out; + } + } + + /* + * Read records and process them. + */ + pcksum = ra.cksum; + while (ra.err == 0 && + NULL != (drr = restore_read(&ra, sizeof (*drr)))) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + ra.err = EINTR; + goto out; + } + + if (ra.byteswap) + backup_byteswap(drr); + + switch (drr->drr_type) { + case DRR_OBJECT: + { + /* + * We need to make a copy of the record header, + * because restore_{object,write} may need to + * restore_read(), which will invalidate drr. 
+ */ + struct drr_object drro = drr->drr_u.drr_object; + ra.err = restore_object(&ra, os, &drro); + break; + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects drrfo = + drr->drr_u.drr_freeobjects; + ra.err = restore_freeobjects(&ra, os, &drrfo); + break; + } + case DRR_WRITE: + { + struct drr_write drrw = drr->drr_u.drr_write; + ra.err = restore_write(&ra, os, &drrw); + break; + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref drrwbr = + drr->drr_u.drr_write_byref; + ra.err = restore_write_byref(&ra, os, &drrwbr); + break; + } + case DRR_FREE: + { + struct drr_free drrf = drr->drr_u.drr_free; + ra.err = restore_free(&ra, os, &drrf); + break; + } + case DRR_END: + { + struct drr_end drre = drr->drr_u.drr_end; + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. + */ + if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) + ra.err = ECKSUM; + goto out; + } + case DRR_SPILL: + { + struct drr_spill drrs = drr->drr_u.drr_spill; + ra.err = restore_spill(&ra, os, &drrs); + break; + } + default: + ra.err = EINVAL; + goto out; + } + pcksum = ra.cksum; + } + ASSERT(ra.err != 0); + +out: + if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) + zfs_onexit_fd_rele(cleanup_fd); + + if (ra.err != 0) { + /* + * destroy what we created, so we don't leave it in the + * inconsistent restoring state. + */ + txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); + + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + if (drc->drc_real_ds != drc->drc_logical_ds) { + mutex_exit(&drc->drc_logical_ds->ds_recvlock); + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } + } + + kmem_free(ra.buf, ra.bufsize); + *voffp = ra.voff; + return (ra.err); +} + +struct recvendsyncarg { + char *tosnap; + uint64_t creation_time; + uint64_t toguid; +}; + +static int +recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); +} + +static void +recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; + ds->ds_prev->ds_phys->ds_guid = resa->toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; +} + +static int +dmu_recv_existing_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. 
+ */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); + if (err) + goto out; + } else { + mutex_exit(&ds->ds_recvlock); + dsl_dataset_rele(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + return (EBUSY); + } + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + /* swap back */ + (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); + } + +out: + mutex_exit(&ds->ds_recvlock); + dsl_dataset_disown(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); + return (err); +} + +static int +dmu_recv_new_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. + */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + /* clean up the fs we just recv'd into */ + (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); + } else { + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + } + return (err); +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + if (drc->drc_logical_ds != drc->drc_real_ds) + return (dmu_recv_existing_end(drc)); + else + return (dmu_recv_new_end(drc)); +} diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c new file mode 100644 index 000000000000..023f90e12e34 --- /dev/null +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -0,0 +1,482 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_impl.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/callb.h> + +int zfs_pd_blks_max = 100; + +typedef struct prefetch_data { + kmutex_t pd_mtx; + kcondvar_t pd_cv; + int pd_blks_max; + int pd_blks_fetched; + int pd_flags; + boolean_t pd_cancel; + boolean_t pd_exited; +} prefetch_data_t; + +typedef struct traverse_data { + spa_t *td_spa; + uint64_t td_objset; + blkptr_t *td_rootbp; + uint64_t td_min_txg; + int td_flags; + prefetch_data_t *td_pfd; + blkptr_cb_t *td_func; + void *td_arg; +} traverse_data_t; + +static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object); + +static int +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + traverse_data_t *td = arg; + zbookmark_t zb; + + if (bp->blk_birth == 0) + return (0); + + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) + return (0); + + SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); + + return (0); +} + +static int +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + traverse_data_t *td = arg; + + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth == 0) + return (0); + + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, + td->td_arg); + } + return (0); +} + +static void +traverse_zil(traverse_data_t *td, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed; plus, in read-only mode, blocks that are already stable. 
+ */ + if (claim_txg == 0 && spa_writeable(td->td_spa)) + return; + + zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); + + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, + claim_txg); + + zil_free(zilog); +} + +static int +traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, + arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) +{ + zbookmark_t czb; + int err = 0, lasterr = 0; + arc_buf_t *buf = NULL; + prefetch_data_t *pd = td->td_pfd; + boolean_t hard = td->td_flags & TRAVERSE_HARD; + + if (bp->blk_birth == 0) { + err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, + td->td_arg); + return (err); + } + + if (bp->blk_birth <= td->td_min_txg) + return (0); + + if (pd && !pd->pd_exited && + ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { + mutex_enter(&pd->pd_mtx); + ASSERT(pd->pd_blks_fetched >= 0); + while (pd->pd_blks_fetched == 0 && !pd->pd_exited) + cv_wait(&pd->pd_cv, &pd->pd_mtx); + pd->pd_blks_fetched--; + cv_broadcast(&pd->pd_cv); + mutex_exit(&pd->pd_mtx); + } + + if (td->td_flags & TRAVERSE_PRE) { + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err) + return (err); + } + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = dsl_read(NULL, td->td_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visitbp() blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = traverse_visitbp(td, dnp, buf, cbp, &czb); + if (err) { + if (!hard) + break; + lasterr = err; + } + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + int i; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = dsl_read(NULL, td->td_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visitbp() blocks below this */ + dnp = buf->b_data; + for (i = 0; i < epb; i++, dnp++) { + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); + if (err) { + if (!hard) + break; + lasterr = err; + } + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + dnode_phys_t *dnp; + + err = dsl_read_nolock(NULL, td->td_spa, bp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + osp = buf->b_data; + dnp = &osp->os_meta_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_META_DNODE_OBJECT); + if (err && hard) { + lasterr = err; + err = 0; + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_userused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + if (err && hard) { + lasterr = err; + err = 0; + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_groupused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + } + } + + if (buf) + (void) arc_buf_remove_ref(buf, &buf); + + if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); + } + + return (err != 0 ? 
err : lasterr); +} + +static int +traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object) +{ + int j, err = 0, lasterr = 0; + zbookmark_t czb; + boolean_t hard = (td->td_flags & TRAVERSE_HARD); + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_blkptr[j], &czb); + if (err) { + if (!hard) + break; + lasterr = err; + } + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, + object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_spill, &czb); + if (err) { + if (!hard) + return (err); + lasterr = err; + } + } + return (err != 0 ? err : lasterr); +} + +/* ARGSUSED */ +static int +traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, + void *arg) +{ + prefetch_data_t *pfd = arg; + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + + ASSERT(pfd->pd_blks_fetched >= 0); + if (pfd->pd_cancel) + return (EINTR); + + if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + return (0); + + mutex_enter(&pfd->pd_mtx); + while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) + cv_wait(&pfd->pd_cv, &pfd->pd_mtx); + pfd->pd_blks_fetched++; + cv_broadcast(&pfd->pd_cv); + mutex_exit(&pfd->pd_mtx); + + (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, zb); + + return (0); +} + +static void +traverse_prefetch_thread(void *arg) +{ + traverse_data_t *td_main = arg; + traverse_data_t td = *td_main; + zbookmark_t czb; + + td.td_func = traverse_prefetcher; + td.td_arg = td_main->td_pfd; + td.td_pfd = NULL; + + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); + + mutex_enter(&td_main->td_pfd->pd_mtx); + td_main->td_pfd->pd_exited = B_TRUE; + cv_broadcast(&td_main->td_pfd->pd_cv); + mutex_exit(&td_main->td_pfd->pd_mtx); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +static int +traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) +{ + traverse_data_t td; + prefetch_data_t pd = { 0 }; + zbookmark_t czb; + int err; + + td.td_spa = spa; + td.td_objset = ds ? ds->ds_object : 0; + td.td_rootbp = rootbp; + td.td_min_txg = txg_start; + td.td_func = func; + td.td_arg = arg; + td.td_pfd = &pd; + td.td_flags = flags; + + pd.pd_blks_max = zfs_pd_blks_max; + pd.pd_flags = flags; + mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + + /* See comment on ZIL traversal in dsl_scan_visitds. 
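 *
 * [Illustrative sketch, not part of the original change: a minimal
 * consumer of this traversal API, assuming a hypothetical callback
 * that counts every block pointer visited.  The callback signature
 * matches blkptr_cb_t as used by backup_cb() earlier in this diff:
 *
 *	static int
 *	count_blk_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 *	    arc_buf_t *pbuf, const zbookmark_t *zb,
 *	    const dnode_phys_t *dnp, void *arg)
 *	{
 *		if (bp != NULL)
 *			(*(uint64_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint64_t nblocks = 0;
 *	err = traverse_dataset(ds, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH,
 *	    count_blk_cb, &nblocks);
 *
 * A nonzero return from the callback aborts the walk, or is collected
 * as lasterr when TRAVERSE_HARD is set.]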
*/ + if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { + objset_t *os; + + err = dmu_objset_from_ds(ds, &os); + if (err) + return (err); + + traverse_zil(&td, &os->os_zil_header); + } + + if (!(flags & TRAVERSE_PREFETCH) || + 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, + &td, TQ_NOQUEUE)) + pd.pd_exited = B_TRUE; + + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); + + mutex_enter(&pd.pd_mtx); + pd.pd_cancel = B_TRUE; + cv_broadcast(&pd.pd_cv); + while (!pd.pd_exited) + cv_wait(&pd.pd_cv, &pd.pd_mtx); + mutex_exit(&pd.pd_mtx); + + mutex_destroy(&pd.pd_mtx); + cv_destroy(&pd.pd_cv); + + return (err); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, + &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); +} + +/* + * NB: pool must not be changing on-disk (eg, from zdb or sync context). + */ +int +traverse_pool(spa_t *spa, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) +{ + int err, lasterr = 0; + uint64_t obj; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + boolean_t hard = (flags & TRAVERSE_HARD); + + /* visit the MOS */ + err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), + txg_start, flags, func, arg); + if (err) + return (err); + + /* visit each dataset */ + for (obj = 1; err == 0 || (err != ESRCH && hard); + err = dmu_object_next(mos, &obj, FALSE, txg_start)) { + dmu_object_info_t doi; + + err = dmu_object_info(mos, obj, &doi); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } + + if (doi.doi_type == DMU_OT_DSL_DATASET) { + dsl_dataset_t *ds; + uint64_t txg = txg_start; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } + if (ds->ds_phys->ds_prev_snap_txg > txg) + txg = ds->ds_phys->ds_prev_snap_txg; + err = traverse_dataset(ds, txg, flags, func, arg); + dsl_dataset_rele(ds, FTAG); + if (err) { + if (!hard) + return (err); + lasterr = err; + } + } + } + if (err == ESRCH) + err = 0; + return (err != 0 ? err : lasterr); +} diff --git a/uts/common/fs/zfs/dmu_tx.c b/uts/common/fs/zfs/dmu_tx.c new file mode 100644 index 000000000000..bd5c71a2265e --- /dev/null +++ b/uts/common/fs/zfs/dmu_tx.c @@ -0,0 +1,1382 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ +#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ +#include <sys/dsl_pool.h> +#include <sys/zap_impl.h> /* for fzap_default_block_shift */ +#include <sys/spa.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/zfs_context.h> +#include <sys/varargs.h> + +typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, + uint64_t arg1, uint64_t arg2); + + +dmu_tx_t * +dmu_tx_create_dd(dsl_dir_t *dd) +{ + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + tx->tx_dir = dd; + if (dd) + tx->tx_pool = dd->dd_pool; + list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), + offsetof(dmu_tx_hold_t, txh_node)); + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); +#ifdef ZFS_DEBUG + refcount_create(&tx->tx_space_written); + refcount_create(&tx->tx_space_freed); +#endif + return (tx); +} + +dmu_tx_t * +dmu_tx_create(objset_t *os) +{ + dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); + tx->tx_objset = os; + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); + return (tx); +} + +dmu_tx_t * +dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) +{ + dmu_tx_t *tx = dmu_tx_create_dd(NULL); + + ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); + tx->tx_pool = dp; + tx->tx_txg = txg; + tx->tx_anyobj = TRUE; + + return (tx); +} + +int +dmu_tx_is_syncing(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +int +dmu_tx_private_ok(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +static dmu_tx_hold_t * +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +{ + dmu_tx_hold_t *txh; + dnode_t *dn = NULL; + int err; + + if (object != DMU_NEW_OBJECT) { + err = dnode_hold(os, object, tx, &dn); + if (err) { + tx->tx_err = err; + return (NULL); + } + + if (err == 0 && tx->tx_txg != 0) { + mutex_enter(&dn->dn_mtx); + /* + * dn->dn_assigned_txg == tx->tx_txg doesn't pose a + * problem, but there's no way for it to happen (for + * now, at least). + */ + ASSERT(dn->dn_assigned_txg == 0); + dn->dn_assigned_txg = tx->tx_txg; + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + } + + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh->txh_tx = tx; + txh->txh_dnode = dn; +#ifdef ZFS_DEBUG + txh->txh_type = type; + txh->txh_arg1 = arg1; + txh->txh_arg2 = arg2; +#endif + list_insert_tail(&tx->tx_holds, txh); + + return (txh); +} + +void +dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) +{ + /* + * If we're syncing, they can manipulate any object anyhow, and + * the hold on the dnode_t can cause problems. 
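 *
 * [Illustrative sketch, not part of the original change: the usual
 * open-context transaction lifecycle these hold functions support, as
 * exercised by restore_write() earlier in this diff; os, object,
 * length and data are placeholders:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, 0, length);
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 *	if (err) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	dmu_write(os, object, 0, length, data, tx);
 *	dmu_tx_commit(tx);
 *
 * In syncing context the new-object hold is skipped, which is why
 * dmu_tx_add_new_object() tests dmu_tx_is_syncing() before taking it.]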
+ */ + if (!dmu_tx_is_syncing(tx)) { + (void) dmu_tx_hold_object_impl(tx, os, + object, THT_NEWOBJECT, 0, 0); + } +} + +static int +dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) +{ + int err; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, level, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); + dbuf_rele(db, FTAG); + return (err); +} + +static void +dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, + int level, uint64_t blkid, boolean_t freeable, uint64_t *history) +{ + objset_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent = NULL; + blkptr_t *bp = NULL; + uint64_t space; + + if (level >= dn->dn_nlevels || history[level] == blkid) + return; + + history[level] = blkid; + + space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); + + if (db == NULL || db == dn->dn_dbuf) { + ASSERT(level != 0); + db = NULL; + } else { + ASSERT(DB_DNODE(db) == dn); + ASSERT(db->db_level == level); + ASSERT(db->db.db_size == space); + ASSERT(db->db_blkid == blkid); + bp = db->db_blkptr; + parent = db->db_parent; + } + + freeable = (bp && (freeable || + dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); + + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (bp) + txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); + + dmu_tx_count_twig(txh, dn, parent, level + 1, + blkid >> epbs, freeable, history); +} + +/* ARGSUSED */ +static void +dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + uint64_t start, end, i; + int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; + int err = 0; + + if (len == 0) + return; + + min_bs = SPA_MINBLOCKSHIFT; + max_bs = SPA_MAXBLOCKSHIFT; + min_ibs = DN_MIN_INDBLKSHIFT; + max_ibs = DN_MAX_INDBLKSHIFT; + + if (dn) { + uint64_t history[DN_MAX_LEVELS]; + int nlvls = dn->dn_nlevels; + int delta; + + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ + if (dn->dn_maxblkid == 0) { + delta = dn->dn_datablksz; + start = (off < dn->dn_datablksz) ? 0 : 1; + end = (off+len <= dn->dn_datablksz) ? 0 : 1; + if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) + goto out; + delta -= off; + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || + len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err) + goto out; + } + + /* last level-0 block */ + end = (off+len-1) >> dn->dn_datablkshift; + if (end != start && end <= dn->dn_maxblkid && + P2PHASE(off+len, dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err) + goto out; + } + + /* level-1 blocks */ + if (nlvls > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = (start>>shft)+1; i < end>>shft; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) + goto out; + } + } + + err = zio_wait(zio); + if (err) + goto out; + delta = P2NPHASE(off, dn->dn_datablksz); + } + + if (dn->dn_maxblkid > 0) { + /* + * The blocksize can't change, + * so we can make a more precise estimate. 
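+ *
+ * (Illustrative numbers only, assuming a dataset that uses 128K
+ * records: dn_datablkshift is then 17, so min_bs and max_bs both
+ * collapse to 17 below, and min_ibs/max_ibs collapse to the dnode's
+ * actual dn_indblkshift.)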
+ */ + ASSERT(dn->dn_datablkshift != 0); + min_bs = max_bs = dn->dn_datablkshift; + min_ibs = max_ibs = dn->dn_indblkshift; + } else if (dn->dn_indblkshift > max_ibs) { + /* + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on older pools. + */ + min_ibs = max_ibs = dn->dn_indblkshift; + } + + /* + * If this write is not off the end of the file + * we need to account for overwrites/unref. + */ + if (start <= dn->dn_maxblkid) { + for (int l = 0; l < DN_MAX_LEVELS; l++) + history[l] = -1ULL; + } + while (start <= dn->dn_maxblkid) { + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + + if (err) { + txh->txh_tx->tx_err = err; + return; + } + + dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, + history); + dbuf_rele(db, FTAG); + if (++start > end) { + /* + * Account for new indirects appearing + * before this IO gets assigned into a txg. + */ + bits = 64 - min_bs; + epbs = min_ibs - SPA_BLKPTRSHIFT; + for (bits -= epbs * (nlvls - 1); + bits >= 0; bits -= epbs) + txh->txh_fudge += 1ULL << max_ibs; + goto out; + } + off += delta; + if (len >= delta) + len -= delta; + delta = dn->dn_datablksz; + } + } + + /* + * 'end' is the last thing we will access, not one past. + * This way we won't overflow when accessing the last byte. + */ + start = P2ALIGN(off, 1ULL << max_bs); + end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; + txh->txh_space_towrite += end - start + 1; + + start >>= min_bs; + end >>= min_bs; + + epbs = min_ibs - SPA_BLKPTRSHIFT; + + /* + * The object contains at most 2^(64 - min_bs) blocks, + * and each indirect level maps 2^epbs. + */ + for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { + start >>= epbs; + end >>= epbs; + ASSERT3U(end, >=, start); + txh->txh_space_towrite += (end - start + 1) << max_ibs; + if (start != 0) { + /* + * We also need a new blkid=0 indirect block + * to reference any existing file data. 
+ */ + txh->txh_space_towrite += 1ULL << max_ibs; + } + } + +out: + if (txh->txh_space_towrite + txh->txh_space_tooverwrite > + 2 * DMU_MAX_ACCESS) + err = EFBIG; + + if (err) + txh->txh_tx->tx_err = err; +} + +static void +dmu_tx_count_dnode(dmu_tx_hold_t *txh) +{ + dnode_t *dn = txh->txh_dnode; + dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); + uint64_t space = mdn->dn_datablksz + + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); + + if (dn && dn->dn_dbuf->db_blkptr && + dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { + txh->txh_space_tooverwrite += space; + txh->txh_space_tounref += space; + } else { + txh->txh_space_towrite += space; + if (dn && dn->dn_dbuf->db_blkptr) + txh->txh_space_tounref += space; + } +} + +void +dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + ASSERT(len < DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, off, len); + if (txh == NULL) + return; + + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); +} + +static void +dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + uint64_t blkid, nblks, lastblk; + uint64_t space = 0, unref = 0, skipped = 0; + dnode_t *dn = txh->txh_dnode; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + spa_t *spa = txh->txh_tx->tx_pool->dp_spa; + int epbs; + + if (dn->dn_nlevels == 0) + return; + + /* + * The struct_rwlock protects us against dn_nlevels + * changing, in case (against all odds) we manage to dirty & + * sync out the changes after we check for being dirty. + * Also, dbuf_hold_impl() wants us to have the struct_rwlock. + */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_maxblkid == 0) { + if (off == 0 && len >= dn->dn_datablksz) { + blkid = 0; + nblks = 1; + } else { + rw_exit(&dn->dn_struct_rwlock); + return; + } + } else { + blkid = off >> dn->dn_datablkshift; + nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; + + if (blkid >= dn->dn_maxblkid) { + rw_exit(&dn->dn_struct_rwlock); + return; + } + if (blkid + nblks > dn->dn_maxblkid) + nblks = dn->dn_maxblkid - blkid; + + } + if (dn->dn_nlevels == 1) { + int i; + for (i = 0; i < nblks; i++) { + blkptr_t *bp = dn->dn_phys->dn_blkptr; + ASSERT3U(blkid + i, <, dn->dn_nblkptr); + bp += blkid + i; + if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { + dprintf_bp(bp, "can free old%s", ""); + space += bp_get_dsize(spa, bp); + } + unref += BP_GET_ASIZE(bp); + } + nblks = 0; + } + + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels. + */ + { + uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); + int level = (dn->dn_nlevels > 1) ? 
2 : 1; + + while (level++ < DN_MAX_LEVELS) { + txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + ASSERT(blkcnt <= dn->dn_nblkptr); + } + + lastblk = blkid + nblks - 1; + while (nblks) { + dmu_buf_impl_t *dbuf; + uint64_t ibyte, new_blkid; + int epb = 1 << epbs; + int err, i, blkoff, tochk; + blkptr_t *bp; + + ibyte = blkid << dn->dn_datablkshift; + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); + new_blkid = ibyte >> dn->dn_datablkshift; + if (err == ESRCH) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } + if (err) { + txh->txh_tx->tx_err = err; + break; + } + if (new_blkid > lastblk) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } + + if (new_blkid > blkid) { + ASSERT((new_blkid >> epbs) > (blkid >> epbs)); + skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; + nblks -= new_blkid - blkid; + blkid = new_blkid; + } + blkoff = P2PHASE(blkid, epb); + tochk = MIN(epb - blkoff, nblks); + + err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + if (err) { + txh->txh_tx->tx_err = err; + break; + } + + txh->txh_memory_tohold += dbuf->db.db_size; + + /* + * We don't check memory_tohold against DMU_MAX_ACCESS because + * memory_tohold is an over-estimation (especially the >L1 + * indirect blocks), so it could fail. Callers should have + * already verified that they will not be holding too much + * memory. + */ + + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { + txh->txh_tx->tx_err = err; + dbuf_rele(dbuf, FTAG); + break; + } + + bp = dbuf->db.db_data; + bp += blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, &bp[i], + bp[i].blk_birth)) { + dprintf_bp(&bp[i], "can free old%s", ""); + space += bp_get_dsize(spa, &bp[i]); + } + unref += BP_GET_ASIZE(bp); + } + dbuf_rele(dbuf, FTAG); + + blkid += tochk; + nblks -= tochk; + } + rw_exit(&dn->dn_struct_rwlock); + + /* account for new level 1 indirect blocks that might show up */ + if (skipped > 0) { + txh->txh_fudge += skipped << dn->dn_indblkshift; + skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); + txh->txh_memory_tohold += skipped << dn->dn_indblkshift; + } + txh->txh_space_tofree += space; + txh->txh_space_tounref += unref; +} + +void +dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +{ + dmu_tx_hold_t *txh; + dnode_t *dn; + uint64_t start, end, i; + int err, shift; + zio_t *zio; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_FREE, off, len); + if (txh == NULL) + return; + dn = txh->txh_dnode; + + /* first block */ + if (off != 0) + dmu_tx_count_write(txh, off, 1); + /* last block */ + if (len != DMU_OBJECT_END) + dmu_tx_count_write(txh, off+len, 1); + + dmu_tx_count_dnode(txh); + + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) + return; + if (len == DMU_OBJECT_END) + len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + + /* + * For i/o error checking, read the first and last level-0 + * blocks, and all the level-1 blocks. The above count_write's + * have already taken care of the level-0 blocks. + */ + if (dn->dn_nlevels > 1) { + shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + start = off >> shift; + end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; + + zio = zio_root(tx->tx_pool->dp_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + for (i = start; i <= end; i++) { + uint64_t ibyte = i << shift; + err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); + i = ibyte >> shift; + if (err == ESRCH) + break; + if (err) { + tx->tx_err = err; + return; + } + + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) { + tx->tx_err = err; + return; + } + } + err = zio_wait(zio); + if (err) { + tx->tx_err = err; + return; + } + } + + dmu_tx_count_free(txh, off, len); +} + +void +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) +{ + dmu_tx_hold_t *txh; + dnode_t *dn; + uint64_t nblocks; + int epbs, err; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_ZAP, add, (uintptr_t)name); + if (txh == NULL) + return; + dn = txh->txh_dnode; + + dmu_tx_count_dnode(txh); + + if (dn == NULL) { + /* + * We will be able to fit a new object's entries into one leaf + * block. So there will be at most 2 blocks total, + * including the header block. + */ + dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); + return; + } + + ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + + if (dn->dn_maxblkid == 0 && !add) { + /* + * If there is only one block (i.e. this is a micro-zap) + * and we are not adding anything, the accounting is simple. + */ + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) { + tx->tx_err = err; + return; + } + + /* + * Use max block size here, since we don't know how much + * the size will change between now and the dbuf dirty call. + */ + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + &dn->dn_phys->dn_blkptr[0], + dn->dn_phys->dn_blkptr[0].blk_birth)) { + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + } else { + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + } + if (dn->dn_phys->dn_blkptr[0].blk_birth) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + return; + } + + if (dn->dn_maxblkid > 0 && name) { + /* + * access the name in this fat-zap so that we'll check + * for i/o errors to the leaf blocks, etc. + */ + err = zap_lookup(dn->dn_objset, dn->dn_object, name, + 8, 0, NULL); + if (err == EIO) { + tx->tx_err = err; + return; + } + } + + err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, + &txh->txh_space_towrite, &txh->txh_space_tooverwrite); + + /* + * If the modified blocks are scattered to the four winds, + * we'll have to modify an indirect twig for each. + */ + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) + if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; + else + txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; +} + +void +dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_BONUS, 0, 0); + if (txh) + dmu_tx_count_dnode(txh); +} + +void +dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) +{ + dmu_tx_hold_t *txh; + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + DMU_NEW_OBJECT, THT_SPACE, space, 0); + + txh->txh_space_towrite += space; +} + +int +dmu_tx_holds(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + int holds = 0; + + /* + * By asserting that the tx is assigned, we're counting the + * number of dn_tx_holds, which is the same as the number of + * dn_holds. 
Otherwise, we'd be counting dn_holds, but + * dn_tx_holds could be 0. + */ + ASSERT(tx->tx_txg != 0); + + /* if (tx->tx_anyobj == TRUE) */ + /* return (0); */ + + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + if (txh->txh_dnode && txh->txh_dnode->dn_object == object) + holds++; + } + + return (holds); +} + +#ifdef ZFS_DEBUG +void +dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) +{ + dmu_tx_hold_t *txh; + int match_object = FALSE, match_offset = FALSE; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(tx->tx_txg != 0); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); + ASSERT3U(dn->dn_object, ==, db->db.db_object); + + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); + return; + } + + /* XXX No checking on the meta dnode for now */ + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); + return; + } + + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); + if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) + match_object = TRUE; + if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { + int datablkshift = dn->dn_datablkshift ? + dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int shift = datablkshift + epbs * db->db_level; + uint64_t beginblk = shift >= 64 ? 0 : + (txh->txh_arg1 >> shift); + uint64_t endblk = shift >= 64 ? 0 : + ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); + uint64_t blkid = db->db_blkid; + + /* XXX txh_arg2 better not be zero... */ + + dprintf("found txh type %x beginblk=%llx endblk=%llx\n", + txh->txh_type, beginblk, endblk); + + switch (txh->txh_type) { + case THT_WRITE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + /* + * We will let this hold work for the bonus + * or spill buffer so that we don't need to + * hold it when creating a new object. + */ + if (blkid == DMU_BONUS_BLKID || + blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; + case THT_FREE: + /* + * We will dirty all the level 1 blocks in + * the free range and perhaps the first and + * last level 0 block. + */ + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + break; + case THT_SPILL: + if (blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + break; + case THT_BONUS: + if (blkid == DMU_BONUS_BLKID) + match_offset = TRUE; + break; + case THT_ZAP: + match_offset = TRUE; + break; + case THT_NEWOBJECT: + match_object = TRUE; + break; + default: + ASSERT(!"bad txh_type"); + } + } + if (match_object && match_offset) { + DB_DNODE_EXIT(db); + return; + } + } + DB_DNODE_EXIT(db); + panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", + (u_longlong_t)db->db.db_object, db->db_level, + (u_longlong_t)db->db_blkid); +} +#endif + +static int +dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + dmu_tx_hold_t *txh; + spa_t *spa = tx->tx_pool->dp_spa; + uint64_t memory, asize, fsize, usize; + uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; + + ASSERT3U(tx->tx_txg, ==, 0); + + if (tx->tx_err) + return (tx->tx_err); + + if (spa_suspended(spa)) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). 
+ * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. + */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + txg_how != TXG_WAIT) + return (EIO); + + return (ERESTART); + } + + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); + tx->tx_needassign_txh = NULL; + + /* + * NB: No error returns are allowed after txg_hold_open, but + * before processing the dnode holds, due to the + * dmu_tx_unassign() logic. + */ + + towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; + for (txh = list_head(&tx->tx_holds); txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + if (dn != NULL) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_assigned_txg == tx->tx_txg - 1) { + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = txh; + return (ERESTART); + } + if (dn->dn_assigned_txg == 0) + dn->dn_assigned_txg = tx->tx_txg; + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + towrite += txh->txh_space_towrite; + tofree += txh->txh_space_tofree; + tooverwrite += txh->txh_space_tooverwrite; + tounref += txh->txh_space_tounref; + tohold += txh->txh_memory_tohold; + fudge += txh->txh_fudge; + } + + /* + * NB: This check must be after we've held the dnodes, so that + * the dmu_tx_unassign() logic will work properly + */ + if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) + return (ERESTART); + + /* + * If a snapshot has been taken since we made our estimates, + * assume that we won't be able to free or overwrite anything. + */ + if (tx->tx_objset && + dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > + tx->tx_lastsnap_txg) { + towrite += tooverwrite; + tooverwrite = tofree = 0; + } + + /* needed allocation: worst-case estimate of write space */ + asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); + /* freed space estimate: worst-case overwrite + free estimate */ + fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; + /* convert unrefd space to worst-case estimate */ + usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + /* calculate memory footprint estimate */ + memory = towrite + tooverwrite + tohold; + +#ifdef ZFS_DEBUG + /* + * Add in 'tohold' to account for our dirty holds on this memory + * XXX - the "fudge" factor is to account for skipped blocks that + * we missed because dnode_next_offset() misses in-core-only blocks. 
+ */ + tx->tx_space_towrite = asize + + spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); + tx->tx_space_tofree = tofree; + tx->tx_space_tooverwrite = tooverwrite; + tx->tx_space_tounref = tounref; +#endif + + if (tx->tx_dir && asize != 0) { + int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, + asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); + if (err) + return (err); + } + + return (0); +} + +static void +dmu_tx_unassign(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + if (tx->tx_txg == 0) + return; + + txg_rele_to_quiesce(&tx->tx_txgh); + + for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + txg_rele_to_sync(&tx->tx_txgh); + + tx->tx_lasttried_txg = tx->tx_txg; + tx->tx_txg = 0; +} + +/* + * Assign tx to a transaction group. txg_how can be one of: + * + * (1) TXG_WAIT. If the current open txg is full, waits until there's + * a new one. This should be used when you're not holding locks. + * If will only fail if we're truly out of space (or over quota). + * + * (2) TXG_NOWAIT. If we can't assign into the current open txg without + * blocking, returns immediately with ERESTART. This should be used + * whenever you're holding locks. On an ERESTART error, the caller + * should drop locks, do a dmu_tx_wait(tx), and try again. + * + * (3) A specific txg. Use this if you need to ensure that multiple + * transactions all sync in the same txg. Like TXG_NOWAIT, it + * returns ERESTART if it can't assign you into the requested txg. + */ +int +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + int err; + + ASSERT(tx->tx_txg == 0); + ASSERT(txg_how != 0); + ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { + dmu_tx_unassign(tx); + + if (err != ERESTART || txg_how != TXG_WAIT) + return (err); + + dmu_tx_wait(tx); + } + + txg_rele_to_quiesce(&tx->tx_txgh); + + return (0); +} + +void +dmu_tx_wait(dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + + ASSERT(tx->tx_txg == 0); + + /* + * It's possible that the pool has become active after this thread + * has tried to obtain a tx. If that's the case then his + * tx_lasttried_txg would not have been assigned. 
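+ *
+ * For reference, the retry pattern that callers of dmu_tx_assign() and
+ * dmu_tx_wait() typically build looks roughly like the sketch below
+ * (illustrative only; "os", "object", "off" and "len" are assumed
+ * placeholders, not symbols from this file):
+ *
+ * top:
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, off, len);
+ *	error = dmu_tx_assign(tx, TXG_NOWAIT);
+ *	if (error) {
+ *		if (error == ERESTART) {
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		}
+ *		dmu_tx_abort(tx);
+ *		return (error);
+ *	}
+ *	... dirty the held buffers ...
+ *	dmu_tx_commit(tx);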
+ */ + if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { + txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { + dnode_t *dn = tx->tx_needassign_txh->txh_dnode; + + mutex_enter(&dn->dn_mtx); + while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) + cv_wait(&dn->dn_notxholds, &dn->dn_mtx); + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = NULL; + } else { + txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + } +} + +void +dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) +{ +#ifdef ZFS_DEBUG + if (tx->tx_dir == NULL || delta == 0) + return; + + if (delta > 0) { + ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, + tx->tx_space_towrite); + (void) refcount_add_many(&tx->tx_space_written, delta, NULL); + } else { + (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); + } +#endif +} + +void +dmu_tx_commit(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg != 0); + + while (txh = list_head(&tx->tx_holds)) { + dnode_t *dn = txh->txh_dnode; + + list_remove(&tx->tx_holds, txh); + kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + dnode_rele(dn, tx); + } + + if (tx->tx_tempreserve_cookie) + dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + + if (tx->tx_anyobj == FALSE) + txg_rele_to_sync(&tx->tx_txgh); + + list_destroy(&tx->tx_callbacks); + list_destroy(&tx->tx_holds); +#ifdef ZFS_DEBUG + dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", + tx->tx_space_towrite, refcount_count(&tx->tx_space_written), + tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +void +dmu_tx_abort(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + while (txh = list_head(&tx->tx_holds)) { + dnode_t *dn = txh->txh_dnode; + + list_remove(&tx->tx_holds, txh); + kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn != NULL) + dnode_rele(dn, tx); + } + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + + list_destroy(&tx->tx_callbacks); + list_destroy(&tx->tx_holds); +#ifdef ZFS_DEBUG + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +uint64_t +dmu_tx_get_txg(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + return (tx->tx_txg); +} + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. 
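+ *
+ * As a rough illustration (my_commit_cb and arg are hypothetical, not
+ * part of this file), a consumer registers a function matching
+ * dmu_tx_callback_func_t before committing:
+ *
+ *	static void
+ *	my_commit_cb(void *arg, int error)
+ *	{
+ *		if (error == 0)
+ *			... the change reached stable storage ...
+ *		else
+ *			... e.g. ECANCELED from dmu_tx_abort() ...
+ *	}
+ *
+ *	dmu_tx_callback_register(tx, my_commit_cb, arg);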
+ */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while (dcb = list_head(cb_list)) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. + */ +static void +dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) +{ + int i; + + if (!sa->sa_need_attr_registration) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (!sa->sa_attr_table[i].sa_registered) { + if (sa->sa_reg_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, + B_TRUE, sa->sa_attr_table[i].sa_name); + else + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, + B_TRUE, sa->sa_attr_table[i].sa_name); + } + } +} + + +void +dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) +{ + dnode_t *dn; + dmu_tx_hold_t *txh; + blkptr_t *bp; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + + dn = txh->txh_dnode; + + if (dn == NULL) + return; + + /* If blkptr doesn't exist then add space to towrite */ + bp = &dn->dn_phys->dn_spill; + if (BP_IS_HOLE(bp)) { + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref = 0; + } else { + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + bp, bp->blk_birth)) + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + else + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + if (bp->blk_birth) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + } +} + +void +dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) +{ + sa_os_t *sa = tx->tx_objset->os_sa; + + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + else { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + return; + + (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, + THT_SPILL, 0, 0); +} + +/* + * Hold SA attribute + * + * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) + * + * variable_size is the total size of all variable sized attributes + * passed to this function. It is not the total size of all + * variable size attributes that *may* exist on this object. 
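+ *
+ * A representative call from a consumer that already has an SA handle
+ * (illustrative; "zp->z_sa_hdl" is an assumed ZPL znode field) is:
+ *
+ *	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ *
+ * passing B_TRUE instead when the update may grow the layout, which
+ * additionally takes the spill-block hold below.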
+ */ +void +dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) +{ + uint64_t object; + sa_os_t *sa = tx->tx_objset->os_sa; + + ASSERT(hdl != NULL); + + object = sa_handle_object(hdl); + + dmu_tx_hold_bonus(tx, object); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || + tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); + } +} diff --git a/uts/common/fs/zfs/dmu_zfetch.c b/uts/common/fs/zfs/dmu_zfetch.c new file mode 100644 index 000000000000..37037c30f623 --- /dev/null +++ b/uts/common/fs/zfs/dmu_zfetch.c @@ -0,0 +1,724 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_zfetch.h> +#include <sys/dmu.h> +#include <sys/dbuf.h> +#include <sys/kstat.h> + +/* + * I'm against tune-ables, but these should probably exist as tweakable globals + * until we can get this working the way we want it to. 
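+ *
+ * On OpenSolaris-derived systems these globals can usually be overridden
+ * at boot time from /etc/system (shown purely as an illustration):
+ *
+ *	set zfs:zfs_prefetch_disable = 1
+ *	set zfs:zfetch_max_streams = 16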
+ */ + +int zfs_prefetch_disable = 0; + +/* max # of streams per zfetch */ +uint32_t zfetch_max_streams = 8; +/* min time before stream reclaim */ +uint32_t zfetch_min_sec_reap = 2; +/* max number of blocks to fetch at a time */ +uint32_t zfetch_block_cap = 256; +/* number of bytes in a array_read at which we stop prefetching (1Mb) */ +uint64_t zfetch_array_rd_sz = 1024 * 1024; + +/* forward decls for static routines */ +static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); +static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); +static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); +static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); +static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); +static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); +static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); +static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); +static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); + +typedef struct zfetch_stats { + kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_misses; + kstat_named_t zfetchstat_colinear_hits; + kstat_named_t zfetchstat_colinear_misses; + kstat_named_t zfetchstat_stride_hits; + kstat_named_t zfetchstat_stride_misses; + kstat_named_t zfetchstat_reclaim_successes; + kstat_named_t zfetchstat_reclaim_failures; + kstat_named_t zfetchstat_stream_resets; + kstat_named_t zfetchstat_stream_noresets; + kstat_named_t zfetchstat_bogus_streams; +} zfetch_stats_t; + +static zfetch_stats_t zfetch_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "colinear_hits", KSTAT_DATA_UINT64 }, + { "colinear_misses", KSTAT_DATA_UINT64 }, + { "stride_hits", KSTAT_DATA_UINT64 }, + { "stride_misses", KSTAT_DATA_UINT64 }, + { "reclaim_successes", KSTAT_DATA_UINT64 }, + { "reclaim_failures", KSTAT_DATA_UINT64 }, + { "streams_resets", KSTAT_DATA_UINT64 }, + { "streams_noresets", KSTAT_DATA_UINT64 }, + { "bogus_streams", KSTAT_DATA_UINT64 }, +}; + +#define ZFETCHSTAT_INCR(stat, val) \ + atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); + +#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1); + +kstat_t *zfetch_ksp; + +/* + * Given a zfetch structure and a zstream structure, determine whether the + * blocks to be read are part of a co-linear pair of existing prefetch + * streams. If a set is found, coalesce the streams, removing one, and + * configure the prefetch so it looks for a strided access pattern. + * + * In other words: if we find two sequential access streams that are + * the same length and distance N appart, and this read is N from the + * last stream, then we are probably in a strided access pattern. So + * combine the two sequential streams into a single strided stream. + * + * If no co-linear streams are found, return NULL. + */ +static int +dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) +{ + zstream_t *z_walk; + zstream_t *z_comp; + + if (! 
rw_tryenter(&zf->zf_rwlock, RW_WRITER)) + return (0); + + if (zh == NULL) { + rw_exit(&zf->zf_rwlock); + return (0); + } + + for (z_walk = list_head(&zf->zf_stream); z_walk; + z_walk = list_next(&zf->zf_stream, z_walk)) { + for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; + z_comp = list_next(&zf->zf_stream, z_comp)) { + int64_t diff; + + if (z_walk->zst_len != z_walk->zst_stride || + z_comp->zst_len != z_comp->zst_stride) { + continue; + } + + diff = z_comp->zst_offset - z_walk->zst_offset; + if (z_comp->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + + diff = z_walk->zst_offset - z_comp->zst_offset; + if (z_walk->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + } + } + + rw_exit(&zf->zf_rwlock); + return (0); +} + +/* + * Given a zstream_t, determine the bounds of the prefetch. Then call the + * routine that actually prefetches the individual blocks. + */ +static void +dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) +{ + uint64_t prefetch_tail; + uint64_t prefetch_limit; + uint64_t prefetch_ofst; + uint64_t prefetch_len; + uint64_t blocks_fetched; + + zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); + zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); + + prefetch_tail = MAX((int64_t)zs->zst_ph_offset, + (int64_t)(zs->zst_offset + zs->zst_stride)); + /* + * XXX: use a faster division method? + */ + prefetch_limit = zs->zst_offset + zs->zst_len + + (zs->zst_cap * zs->zst_stride) / zs->zst_len; + + while (prefetch_tail < prefetch_limit) { + prefetch_ofst = zs->zst_offset + zs->zst_direction * + (prefetch_tail - zs->zst_offset); + + prefetch_len = zs->zst_len; + + /* + * Don't prefetch beyond the end of the file, if working + * backwards. + */ + if ((zs->zst_direction == ZFETCH_BACKWARD) && + (prefetch_ofst > prefetch_tail)) { + prefetch_len += prefetch_ofst; + prefetch_ofst = 0; + } + + /* don't prefetch more than we're supposed to */ + if (prefetch_len > zs->zst_len) + break; + + blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, + prefetch_ofst, zs->zst_len); + + prefetch_tail += zs->zst_stride; + /* stop if we've run out of stuff to prefetch */ + if (blocks_fetched < zs->zst_len) + break; + } + zs->zst_ph_offset = prefetch_tail; + zs->zst_last = ddi_get_lbolt(); +} + +void +zfetch_init(void) +{ + + zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", + KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zfetch_ksp != NULL) { + zfetch_ksp->ks_data = &zfetch_stats; + kstat_install(zfetch_ksp); + } +} + +void +zfetch_fini(void) +{ + if (zfetch_ksp != NULL) { + kstat_delete(zfetch_ksp); + zfetch_ksp = NULL; + } +} + +/* + * This takes a pointer to a zfetch structure and a dnode. 
It performs the + * necessary setup for the zfetch structure, grokking data from the + * associated dnode. + */ +void +dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) +{ + if (zf == NULL) { + return; + } + + zf->zf_dnode = dno; + zf->zf_stream_cnt = 0; + zf->zf_alloc_fail = 0; + + list_create(&zf->zf_stream, sizeof (zstream_t), + offsetof(zstream_t, zst_node)); + + rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); +} + +/* + * This function computes the actual size, in blocks, that can be prefetched, + * and fetches it. + */ +static uint64_t +dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + uint64_t i; + + fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); + + for (i = 0; i < fetchsz; i++) { + dbuf_prefetch(dn, blkid + i); + } + + return (fetchsz); +} + +/* + * this function returns the number of blocks that would be prefetched, based + * upon the supplied dnode, blockid, and nblks. This is used so that we can + * update streams in place, and then prefetch with their old value after the + * fact. This way, we can delay the prefetch, but subsequent accesses to the + * stream won't result in the same data being prefetched multiple times. + */ +static uint64_t +dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + + if (blkid > dn->dn_maxblkid) { + return (0); + } + + /* compute fetch size */ + if (blkid + nblks + 1 > dn->dn_maxblkid) { + fetchsz = (dn->dn_maxblkid - blkid) + 1; + ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); + } else { + fetchsz = nblks; + } + + + return (fetchsz); +} + +/* + * given a zfetch and a zstream structure, see if there is an associated zstream + * for this block read. If so, it starts a prefetch for the stream it + * located and returns true, otherwise it returns false + */ +static int +dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) +{ + zstream_t *zs; + int64_t diff; + int reset = !prefetched; + int rc = 0; + + if (zh == NULL) + return (0); + + /* + * XXX: This locking strategy is a bit coarse; however, it's impact has + * yet to be tested. If this turns out to be an issue, it can be + * modified in a number of different ways. + */ + + rw_enter(&zf->zf_rwlock, RW_READER); +top: + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + /* + * XXX - should this be an assert? + */ + if (zs->zst_len == 0) { + /* bogus stream */ + ZFETCHSTAT_BUMP(zfetchstat_bogus_streams); + continue; + } + + /* + * We hit this case when we are in a strided prefetch stream: + * we will read "len" blocks before "striding". + */ + if (zh->zst_offset >= zs->zst_offset && + zh->zst_offset < zs->zst_offset + zs->zst_len) { + if (prefetched) { + /* already fetched */ + ZFETCHSTAT_BUMP(zfetchstat_stride_hits); + rc = 1; + goto out; + } else { + ZFETCHSTAT_BUMP(zfetchstat_stride_misses); + } + } + + /* + * This is the forward sequential read case: we increment + * len by one each time we hit here, so we will enter this + * case on every read. + */ + if (zh->zst_offset == zs->zst_offset + zs->zst_len) { + + reset = !prefetched && zs->zst_len > 1; + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset + zs->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + zs->zst_len += zh->zst_len; + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_offset += diff; + zs->zst_len = zs->zst_len > diff ? + zs->zst_len - diff : 0; + } + zs->zst_direction = ZFETCH_FORWARD; + + break; + + /* + * Same as above, but reading backwards through the file. 
+ */ + } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { + /* backwards sequential access */ + + reset = !prefetched && zs->zst_len > 1; + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset - zh->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zh->zst_len ? + zs->zst_offset - zh->zst_len : 0; + zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? + zs->zst_ph_offset - zh->zst_len : 0; + zs->zst_len += zh->zst_len; + + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_ph_offset = zs->zst_ph_offset > diff ? + zs->zst_ph_offset - diff : 0; + zs->zst_len = zs->zst_len > diff ? + zs->zst_len - diff : zs->zst_len; + } + zs->zst_direction = ZFETCH_BACKWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided forward access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset += zs->zst_stride; + zs->zst_direction = ZFETCH_FORWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided reverse access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zs->zst_stride ? + zs->zst_offset - zs->zst_stride : 0; + zs->zst_ph_offset = (zs->zst_ph_offset > + (2 * zs->zst_stride)) ? + (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; + zs->zst_direction = ZFETCH_BACKWARD; + + break; + } + } + + if (zs) { + if (reset) { + zstream_t *remove = zs; + + ZFETCHSTAT_BUMP(zfetchstat_stream_resets); + rc = 0; + mutex_exit(&zs->zst_lock); + rw_exit(&zf->zf_rwlock); + rw_enter(&zf->zf_rwlock, RW_WRITER); + /* + * Relocate the stream, in case someone removes + * it while we were acquiring the WRITER lock. + */ + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + if (zs == remove) { + dmu_zfetch_stream_remove(zf, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + break; + } + } + } else { + ZFETCHSTAT_BUMP(zfetchstat_stream_noresets); + rc = 1; + dmu_zfetch_dofetch(zf, zs); + mutex_exit(&zs->zst_lock); + } + } +out: + rw_exit(&zf->zf_rwlock); + return (rc); +} + +/* + * Clean-up state associated with a zfetch structure. This frees allocated + * structure members, empties the zf_stream tree, and generally makes things + * nice. This doesn't free the zfetch_t itself, that's left to the caller. + */ +void +dmu_zfetch_rele(zfetch_t *zf) +{ + zstream_t *zs; + zstream_t *zs_next; + + ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); + + for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + } + list_destroy(&zf->zf_stream); + rw_destroy(&zf->zf_rwlock); + + zf->zf_dnode = NULL; +} + +/* + * Given a zfetch and zstream structure, insert the zstream structure into the + * AVL tree contained within the zfetch structure. Peform the appropriate + * book-keeping. It is possible that another thread has inserted a stream which + * matches one that we are about to insert, so we must be sure to check for this + * case. 
If one is found, return failure, and let the caller cleanup the + * duplicates. + */ +static int +dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) +{ + zstream_t *zs_walk; + zstream_t *zs_next; + + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { + zs_next = list_next(&zf->zf_stream, zs_walk); + + if (dmu_zfetch_streams_equal(zs_walk, zs)) { + return (0); + } + } + + list_insert_head(&zf->zf_stream, zs); + zf->zf_stream_cnt++; + return (1); +} + + +/* + * Walk the list of zstreams in the given zfetch, find an old one (by time), and + * reclaim it for use by the caller. + */ +static zstream_t * +dmu_zfetch_stream_reclaim(zfetch_t *zf) +{ + zstream_t *zs; + + if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) + return (0); + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap) + break; + } + + if (zs) { + dmu_zfetch_stream_remove(zf, zs); + mutex_destroy(&zs->zst_lock); + bzero(zs, sizeof (zstream_t)); + } else { + zf->zf_alloc_fail++; + } + rw_exit(&zf->zf_rwlock); + + return (zs); +} + +/* + * Given a zfetch and zstream structure, remove the zstream structure from its + * container in the zfetch structure. Perform the appropriate book-keeping. + */ +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + list_remove(&zf->zf_stream, zs); + zf->zf_stream_cnt--; +} + +static int +dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) +{ + if (zs1->zst_offset != zs2->zst_offset) + return (0); + + if (zs1->zst_len != zs2->zst_len) + return (0); + + if (zs1->zst_stride != zs2->zst_stride) + return (0); + + if (zs1->zst_ph_offset != zs2->zst_ph_offset) + return (0); + + if (zs1->zst_cap != zs2->zst_cap) + return (0); + + if (zs1->zst_direction != zs2->zst_direction) + return (0); + + return (1); +} + +/* + * This is the prefetch entry point. It calls all of the other dmu_zfetch + * routines to create, delete, find, or operate upon prefetch streams. + */ +void +dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) +{ + zstream_t zst; + zstream_t *newstream; + int fetched; + int inserted; + unsigned int blkshft; + uint64_t blksz; + + if (zfs_prefetch_disable) + return; + + /* files that aren't ln2 blocksz are only one block -- nothing to do */ + if (!zf->zf_dnode->dn_datablkshift) + return; + + /* convert offset and size, into blockid and nblocks */ + blkshft = zf->zf_dnode->dn_datablkshift; + blksz = (1 << blkshft); + + bzero(&zst, sizeof (zstream_t)); + zst.zst_offset = offset >> blkshft; + zst.zst_len = (P2ROUNDUP(offset + size, blksz) - + P2ALIGN(offset, blksz)) >> blkshft; + + fetched = dmu_zfetch_find(zf, &zst, prefetched); + if (fetched) { + ZFETCHSTAT_BUMP(zfetchstat_hits); + } else { + ZFETCHSTAT_BUMP(zfetchstat_misses); + if (fetched = dmu_zfetch_colinear(zf, &zst)) { + ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); + } else { + ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); + } + } + + if (!fetched) { + newstream = dmu_zfetch_stream_reclaim(zf); + + /* + * we still couldn't find a stream, drop the lock, and allocate + * one if possible. Otherwise, give up and go home. 
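+ *
+ * (Worked example of the cap computed below, assuming the default
+ * tunables: with zfetch_block_cap = 256 and zfetch_max_streams = 8, a
+ * dnode whose dn_maxblkid is 1024 allows MIN(8, 1024 / 256) = 4
+ * streams, while a file spanning fewer than 256 blocks still gets one
+ * stream because max_streams is bumped from 0 to 1.)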
+ */ + if (newstream) { + ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); + } else { + uint64_t maxblocks; + uint32_t max_streams; + uint32_t cur_streams; + + ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); + cur_streams = zf->zf_stream_cnt; + maxblocks = zf->zf_dnode->dn_maxblkid; + + max_streams = MIN(zfetch_max_streams, + (maxblocks / zfetch_block_cap)); + if (max_streams == 0) { + max_streams++; + } + + if (cur_streams >= max_streams) { + return; + } + newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); + } + + newstream->zst_offset = zst.zst_offset; + newstream->zst_len = zst.zst_len; + newstream->zst_stride = zst.zst_len; + newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; + newstream->zst_cap = zst.zst_len; + newstream->zst_direction = ZFETCH_FORWARD; + newstream->zst_last = ddi_get_lbolt(); + + mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); + + rw_enter(&zf->zf_rwlock, RW_WRITER); + inserted = dmu_zfetch_stream_insert(zf, newstream); + rw_exit(&zf->zf_rwlock); + + if (!inserted) { + mutex_destroy(&newstream->zst_lock); + kmem_free(newstream, sizeof (zstream_t)); + } + } +} diff --git a/uts/common/fs/zfs/dnode.c b/uts/common/fs/zfs/dnode.c new file mode 100644 index 000000000000..850dd5816bf3 --- /dev/null +++ b/uts/common/fs/zfs/dnode.c @@ -0,0 +1,1993 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> + +static int free_range_compar(const void *node1, const void *node2); + +static kmem_cache_t *dnode_cache; +/* + * Define DNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. 
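+ *
+ * The usage is just a per-event counter bump (sketch with an assumed
+ * counter name):
+ *
+ *	static uint64_t dnode_move_skipped;
+ *	...
+ *	DNODE_STAT_ADD(dnode_move_skipped);
+ *
+ * When DNODE_STATS is not defined, DNODE_STAT_ADD() expands to nothing,
+ * so the counters have no cost in non-DEBUG builds.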
+ */ +#ifdef DEBUG +#define DNODE_STATS +#endif /* DEBUG */ + +#ifdef DNODE_STATS +#define DNODE_STAT_ADD(stat) ((stat)++) +#else +#define DNODE_STAT_ADD(stat) /* nothing */ +#endif /* DNODE_STATS */ + +static dnode_phys_t dnode_phys_zero; + +int zfs_default_bs = SPA_MINBLOCKSHIFT; +int zfs_default_ibs = DN_MAX_INDBLKSHIFT; + +static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); + +/* ARGSUSED */ +static int +dnode_cons(void *arg, void *unused, int kmflag) +{ + dnode_t *dn = arg; + int i; + + rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + + refcount_create(&dn->dn_holds); + refcount_create(&dn->dn_tx_holds); + list_link_init(&dn->dn_link); + + bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); + bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); + bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); + bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); + bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); + bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); + bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); + + for (i = 0; i < TXG_SIZE; i++) { + list_link_init(&dn->dn_dirty_link[i]); + avl_create(&dn->dn_ranges[i], free_range_compar, + sizeof (free_range_t), + offsetof(struct free_range, fr_node)); + list_create(&dn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_have_spill = B_FALSE; + dn->dn_zio = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dn->dn_dbufs_count = 0; + list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + dn->dn_moved = 0; + return (0); +} + +/* ARGSUSED */ +static void +dnode_dest(void *arg, void *unused) +{ + int i; + dnode_t *dn = arg; + + rw_destroy(&dn->dn_struct_rwlock); + mutex_destroy(&dn->dn_mtx); + mutex_destroy(&dn->dn_dbufs_mtx); + cv_destroy(&dn->dn_notxholds); + refcount_destroy(&dn->dn_holds); + refcount_destroy(&dn->dn_tx_holds); + ASSERT(!list_link_active(&dn->dn_link)); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + avl_destroy(&dn->dn_ranges[i]); + list_destroy(&dn->dn_dirty_records[i]); + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonustype[i], ==, 0); + ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_blksz[i], ==, 0); + } + + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_free_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT3U(dn->dn_dirtyctx, ==, 0); + ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(!dn->dn_have_spill); + ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT3U(dn->dn_oldused, ==, 0); + ASSERT3U(dn->dn_oldflags, ==, 0); + ASSERT3U(dn->dn_olduid, ==, 0); + ASSERT3U(dn->dn_oldgid, ==, 0); + ASSERT3U(dn->dn_newuid, ==, 0); + ASSERT3U(dn->dn_newgid, ==, 0); + ASSERT3U(dn->dn_id_flags, ==, 0); + + ASSERT3U(dn->dn_dbufs_count, ==, 0); 
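+	/* dn_dbufs must already be empty; tear down the list itself */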
+ list_destroy(&dn->dn_dbufs); +} + +void +dnode_init(void) +{ + ASSERT(dnode_cache == NULL); + dnode_cache = kmem_cache_create("dnode_t", + sizeof (dnode_t), + 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); + kmem_cache_set_move(dnode_cache, dnode_move); +} + +void +dnode_fini(void) +{ + kmem_cache_destroy(dnode_cache); + dnode_cache = NULL; +} + + +#ifdef ZFS_DEBUG +void +dnode_verify(dnode_t *dn) +{ + int drop_struct_lock = FALSE; + + ASSERT(dn->dn_phys); + ASSERT(dn->dn_objset); + ASSERT(dn->dn_handle->dnh_dnode == dn); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + + if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) + return; + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { + int i; + ASSERT3U(dn->dn_indblkshift, >=, 0); + ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); + if (dn->dn_datablkshift) { + ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); + ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); + } + ASSERT3U(dn->dn_nlevels, <=, 30); + ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT3U(dn->dn_nblkptr, >=, 1); + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(dn->dn_datablksz, ==, + dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); + ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + + dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); + } + } + if (dn->dn_phys->dn_type != DMU_OT_NONE) + ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); + ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); + if (dn->dn_dbuf != NULL) { + ASSERT3P(dn->dn_phys, ==, + (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); +} +#endif + +void +dnode_byteswap(dnode_phys_t *dnp) +{ + uint64_t *buf64 = (void*)&dnp->dn_blkptr; + int i; + + if (dnp->dn_type == DMU_OT_NONE) { + bzero(dnp, sizeof (dnode_phys_t)); + return; + } + + dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); + dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); + dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); + dnp->dn_used = BSWAP_64(dnp->dn_used); + + /* + * dn_nblkptr is only one byte, so it's OK to read it in either + * byte order. We can't read dn_bouslen. + */ + ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); + ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); + for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) + buf64[i] = BSWAP_64(buf64[i]); + + /* + * OK to check dn_bonuslen for zero, because it won't matter if + * we have the wrong byte order. This is necessary because the + * dnode dnode is smaller than a regular dnode. + */ + if (dnp->dn_bonuslen != 0) { + /* + * Note that the bonus length calculated here may be + * longer than the actual bonus buffer. This is because + * we always put the bonus buffer after the last block + * pointer (instead of packing it against the end of the + * dnode buffer). 
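+ *
+ * (Worked example, assuming the usual 512-byte on-disk dnode layout in
+ * which DN_MAX_BONUSLEN is 320 and sizeof (blkptr_t) is 128: a dnode
+ * with dn_nblkptr == 3 gets off = 2 * 128 = 256 and len = 320 - 256 =
+ * 64, while dn_nblkptr == 1 byteswaps the full 320-byte bonus area.)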
+ */ + int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); + size_t len = DN_MAX_BONUSLEN - off; + ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); + dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + } + + /* Swap SPILL block if we have one */ + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); + +} + +void +dnode_buf_byteswap(void *vbuf, size_t size) +{ + dnode_phys_t *buf = vbuf; + int i; + + ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); + ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); + + size >>= DNODE_SHIFT; + for (i = 0; i < size; i++) { + dnode_byteswap(buf); + buf++; + } +} + +static int +free_range_compar(const void *node1, const void *node2) +{ + const free_range_t *rp1 = node1; + const free_range_t *rp2 = node2; + + if (rp1->fr_blkid < rp2->fr_blkid) + return (-1); + else if (rp1->fr_blkid > rp2->fr_blkid) + return (1); + else return (0); +} + +void +dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + dn->dn_bonuslen = newsize; + if (newsize == 0) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; + else + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dn->dn_bonustype = newtype; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + dnode_setdirty(dn, tx); + dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; + dn->dn_have_spill = B_FALSE; +} + +static void +dnode_setdblksz(dnode_t *dn, int size) +{ + ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, >=, SPA_MINBLOCKSIZE); + ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, + 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); + dn->dn_datablksz = size; + dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; + dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0; +} + +static dnode_t * +dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, + uint64_t object, dnode_handle_t *dnh) +{ + dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + + ASSERT(!POINTER_IS_VALID(dn->dn_objset)); + dn->dn_moved = 0; + + /* + * Defer setting dn_objset until the dnode is ready to be a candidate + * for the dnode_move() callback. 
+ */ + dn->dn_object = object; + dn->dn_dbuf = db; + dn->dn_handle = dnh; + dn->dn_phys = dnp; + + if (dnp->dn_datablkszsec) { + dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } + dn->dn_indblkshift = dnp->dn_indblkshift; + dn->dn_nlevels = dnp->dn_nlevels; + dn->dn_type = dnp->dn_type; + dn->dn_nblkptr = dnp->dn_nblkptr; + dn->dn_checksum = dnp->dn_checksum; + dn->dn_compress = dnp->dn_compress; + dn->dn_bonustype = dnp->dn_bonustype; + dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_maxblkid = dnp->dn_maxblkid; + dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); + dn->dn_id_flags = 0; + + dmu_zfetch_init(&dn->dn_zfetch, dn); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + + mutex_enter(&os->os_lock); + list_insert_head(&os->os_dnodes, dn); + membar_producer(); + /* + * Everything else must be valid before assigning dn_objset makes the + * dnode eligible for dnode_move(). + */ + dn->dn_objset = os; + mutex_exit(&os->os_lock); + + arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); + return (dn); +} + +/* + * Caller must be holding the dnode handle, which is released upon return. + */ +static void +dnode_destroy(dnode_t *dn) +{ + objset_t *os = dn->dn_objset; + + ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); + + mutex_enter(&os->os_lock); + POINTER_INVALIDATE(&dn->dn_objset); + list_remove(&os->os_dnodes, dn); + mutex_exit(&os->os_lock); + + /* the dnode can no longer move, so we can release the handle */ + zrl_remove(&dn->dn_handle->dnh_zrlock); + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + + dn->dn_dirtyctx = 0; + if (dn->dn_dirtyctx_firstset != NULL) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + if (dn->dn_bonus != NULL) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } + dn->dn_zio = NULL; + + dn->dn_have_spill = B_FALSE; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dmu_zfetch_rele(&dn->dn_zfetch); + kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); +} + +void +dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int i; + + if (blocksize == 0) + blocksize = 1 << zfs_default_bs; + else if (blocksize > SPA_MAXBLOCKSIZE) + blocksize = SPA_MAXBLOCKSIZE; + else + blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); + + if (ibs == 0) + ibs = zfs_default_ibs; + + ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); + + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, + dn->dn_object, tx->tx_txg, blocksize, ibs); + + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); + ASSERT(ot != DMU_OT_NONE); + ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype == DMU_OT_SA && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT3U(dn->dn_maxblkid, ==, 0); + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + 
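/* + * The dnode being allocated must be otherwise idle: at most the + * caller's hold, no cached dbufs, and no stale per-txg state in any + * of the slots checked below. + */ +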
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_bonustype[i], ==, 0); + ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); + ASSERT3U(dn->dn_next_blksz[i], ==, 0); + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); + ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); + } + + dn->dn_type = ot; + dnode_setdblksz(dn, blocksize); + dn->dn_indblkshift = ibs; + dn->dn_nlevels = 1; + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + dn->dn_nblkptr = 1; + else + dn->dn_nblkptr = 1 + + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + dn->dn_dirtyctx = 0; + + dn->dn_free_txg = 0; + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + + dn->dn_allocated_txg = tx->tx_txg; + dn->dn_id_flags = 0; + + dnode_setdirty(dn, tx); + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; +} + +void +dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int nblkptr; + + ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); + ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + ASSERT(tx->tx_txg != 0); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0) || + (bonustype == DMU_OT_SA && bonuslen == 0)); + ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + + /* clean up any unreferenced dbufs */ + dnode_evict_dbufs(dn); + + dn->dn_id_flags = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_setdirty(dn, tx); + if (dn->dn_datablksz != blocksize) { + /* change blocksize */ + ASSERT(dn->dn_maxblkid == 0 && + (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + dnode_block_freed(dn, 0))); + dnode_setdblksz(dn, blocksize); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + } + if (dn->dn_bonuslen != bonuslen) + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; + + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + nblkptr = 1; + else + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (dn->dn_bonustype != bonustype) + dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; + if (dn->dn_nblkptr != nblkptr) + dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + dbuf_rm_spill(dn, tx); + dnode_rm_spill(dn, tx); + } + rw_exit(&dn->dn_struct_rwlock); + + /* change type */ + dn->dn_type = ot; + + /* change bonus size and type */ + mutex_enter(&dn->dn_mtx); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_nblkptr = nblkptr; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + + /* fix up the bonus db_size */ + if (dn->dn_bonus) 
{ + dn->dn_bonus->db.db_size = + DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } + + dn->dn_allocated_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); +} + +#ifdef DNODE_STATS +static struct { + uint64_t dms_dnode_invalid; + uint64_t dms_dnode_recheck1; + uint64_t dms_dnode_recheck2; + uint64_t dms_dnode_special; + uint64_t dms_dnode_handle; + uint64_t dms_dnode_rwlock; + uint64_t dms_dnode_active; +} dnode_move_stats; +#endif /* DNODE_STATS */ + +static void +dnode_move_impl(dnode_t *odn, dnode_t *ndn) +{ + int i; + + ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); + ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); + + /* Copy fields. */ + ndn->dn_objset = odn->dn_objset; + ndn->dn_object = odn->dn_object; + ndn->dn_dbuf = odn->dn_dbuf; + ndn->dn_handle = odn->dn_handle; + ndn->dn_phys = odn->dn_phys; + ndn->dn_type = odn->dn_type; + ndn->dn_bonuslen = odn->dn_bonuslen; + ndn->dn_bonustype = odn->dn_bonustype; + ndn->dn_nblkptr = odn->dn_nblkptr; + ndn->dn_checksum = odn->dn_checksum; + ndn->dn_compress = odn->dn_compress; + ndn->dn_nlevels = odn->dn_nlevels; + ndn->dn_indblkshift = odn->dn_indblkshift; + ndn->dn_datablkshift = odn->dn_datablkshift; + ndn->dn_datablkszsec = odn->dn_datablkszsec; + ndn->dn_datablksz = odn->dn_datablksz; + ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + sizeof (odn->dn_next_nblkptr)); + bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + sizeof (odn->dn_next_nlevels)); + bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + sizeof (odn->dn_next_indblkshift)); + bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + sizeof (odn->dn_next_bonustype)); + bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + sizeof (odn->dn_rm_spillblk)); + bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + sizeof (odn->dn_next_bonuslen)); + bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + sizeof (odn->dn_next_blksz)); + for (i = 0; i < TXG_SIZE; i++) { + list_move_tail(&ndn->dn_dirty_records[i], + &odn->dn_dirty_records[i]); + } + bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges)); + ndn->dn_allocated_txg = odn->dn_allocated_txg; + ndn->dn_free_txg = odn->dn_free_txg; + ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirtyctx = odn->dn_dirtyctx; + ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ASSERT(refcount_count(&odn->dn_tx_holds) == 0); + refcount_transfer(&ndn->dn_holds, &odn->dn_holds); + ASSERT(list_is_empty(&ndn->dn_dbufs)); + list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_bonus = odn->dn_bonus; + ndn->dn_have_spill = odn->dn_have_spill; + ndn->dn_zio = odn->dn_zio; + ndn->dn_oldused = odn->dn_oldused; + ndn->dn_oldflags = odn->dn_oldflags; + ndn->dn_olduid = odn->dn_olduid; + ndn->dn_oldgid = odn->dn_oldgid; + ndn->dn_newuid = odn->dn_newuid; + ndn->dn_newgid = odn->dn_newgid; + ndn->dn_id_flags = odn->dn_id_flags; + dmu_zfetch_init(&ndn->dn_zfetch, NULL); + list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); + ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt; + ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail; + + /* + * Update back pointers. 
Updating the handle fixes the back pointer of + * every descendant dbuf as well as the bonus dbuf. + */ + ASSERT(ndn->dn_handle->dnh_dnode == odn); + ndn->dn_handle->dnh_dnode = ndn; + if (ndn->dn_zfetch.zf_dnode == odn) { + ndn->dn_zfetch.zf_dnode = ndn; + } + + /* + * Invalidate the original dnode by clearing all of its back pointers. + */ + odn->dn_dbuf = NULL; + odn->dn_handle = NULL; + list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + odn->dn_dbufs_count = 0; + odn->dn_bonus = NULL; + odn->dn_zfetch.zf_dnode = NULL; + + /* + * Set the low bit of the objset pointer to ensure that dnode_move() + * recognizes the dnode as invalid in any subsequent callback. + */ + POINTER_INVALIDATE(&odn->dn_objset); + + /* + * Satisfy the destructor. + */ + for (i = 0; i < TXG_SIZE; i++) { + list_create(&odn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + odn->dn_ranges[i].avl_root = NULL; + odn->dn_ranges[i].avl_numnodes = 0; + odn->dn_next_nlevels[i] = 0; + odn->dn_next_indblkshift[i] = 0; + odn->dn_next_bonustype[i] = 0; + odn->dn_rm_spillblk[i] = 0; + odn->dn_next_bonuslen[i] = 0; + odn->dn_next_blksz[i] = 0; + } + odn->dn_allocated_txg = 0; + odn->dn_free_txg = 0; + odn->dn_assigned_txg = 0; + odn->dn_dirtyctx = 0; + odn->dn_dirtyctx_firstset = NULL; + odn->dn_have_spill = B_FALSE; + odn->dn_zio = NULL; + odn->dn_oldused = 0; + odn->dn_oldflags = 0; + odn->dn_olduid = 0; + odn->dn_oldgid = 0; + odn->dn_newuid = 0; + odn->dn_newgid = 0; + odn->dn_id_flags = 0; + + /* + * Mark the dnode. + */ + ndn->dn_moved = 1; + odn->dn_moved = (uint8_t)-1; +} + +#ifdef _KERNEL +/*ARGSUSED*/ +static kmem_cbrc_t +dnode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + dnode_t *odn = buf, *ndn = newbuf; + objset_t *os; + int64_t refcount; + uint32_t dbufs; + + /* + * The dnode is on the objset's list of known dnodes if the objset + * pointer is valid. We set the low bit of the objset pointer when + * freeing the dnode to invalidate it, and the memory patterns written + * by kmem (baddcafe and deadbeef) set at least one of the two low bits. + * A newly created dnode sets the objset pointer last of all to indicate + * that the dnode is known and in a valid state to be moved by this + * function. + */ + os = odn->dn_objset; + if (!POINTER_IS_VALID(os)) { + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the objset does not go away during the move. + */ + rw_enter(&os_lock, RW_WRITER); + if (os != odn->dn_objset) { + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the dnode is still valid, then so is the objset. We know that no + * valid objset can be freed while we hold os_lock, so we can safely + * ensure that the objset remains in use. + */ + mutex_enter(&os->os_lock); + + /* + * Recheck the objset pointer in case the dnode was removed just before + * acquiring the lock. + */ + if (os != odn->dn_objset) { + mutex_exit(&os->os_lock); + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold os->os_lock, the dnode + * cannot be freed and fields within the dnode can be safely accessed. + * The objset listing this dnode cannot go away as long as this dnode is + * on its list. 
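+ * It is therefore safe to drop os_lock below while os->os_lock + * remains held.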
+ */ + rw_exit(&os_lock); + if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + return (KMEM_CBRC_NO); + } + ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ + + /* + * Lock the dnode handle to prevent the dnode from obtaining any new + * holds. This also prevents the descendant dbufs and the bonus dbuf + * from accessing the dnode, so that we can discount their holds. The + * handle is safe to access because we know that while the dnode cannot + * go away, neither can its handle. Once we hold dnh_zrlock, we can + * safely move any dnode referenced only by dbufs. + */ + if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + return (KMEM_CBRC_LATER); + } + + /* + * Ensure a consistent view of the dnode's holds and the dnode's dbufs. + * We need to guarantee that there is a hold for every dbuf in order to + * determine whether the dnode is actively referenced. Falsely matching + * a dbuf to an active hold would lead to an unsafe move. It's possible + * that a thread already having an active dnode hold is about to add a + * dbuf, and we can't compare hold and dbuf counts while the add is in + * progress. + */ + if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + return (KMEM_CBRC_LATER); + } + + /* + * A dbuf may be removed (evicted) without an active dnode hold. In that + * case, the dbuf count is decremented under the handle lock before the + * dbuf's hold is released. This order ensures that if we count the hold + * after the dbuf is removed but before its hold is released, we will + * treat the unmatched hold as active and exit safely. If we count the + * hold before the dbuf is removed, the hold is discounted, and the + * removal is blocked until the move completes. + */ + refcount = refcount_count(&odn->dn_holds); + ASSERT(refcount >= 0); + dbufs = odn->dn_dbufs_count; + + /* We can't have more dbufs than dnode holds. */ + ASSERT3U(dbufs, <=, refcount); + DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, + uint32_t, dbufs); + + if (refcount > dbufs) { + rw_exit(&odn->dn_struct_rwlock); + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + return (KMEM_CBRC_LATER); + } + + rw_exit(&odn->dn_struct_rwlock); + + /* + * At this point we know that anyone with a hold on the dnode is not + * actively referencing it. The dnode is known and in a valid state to + * move. We're holding the locks needed to execute the critical section. + */ + dnode_move_impl(odn, ndn); + + list_link_replace(&odn->dn_link, &ndn->dn_link); + /* If the dnode was safe to move, the refcount cannot have changed. */ + ASSERT(refcount == refcount_count(&ndn->dn_holds)); + ASSERT(dbufs == ndn->dn_dbufs_count); + zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ + mutex_exit(&os->os_lock); + + return (KMEM_CBRC_YES); +} +#endif /* _KERNEL */ + +void +dnode_special_close(dnode_handle_t *dnh) +{ + dnode_t *dn = dnh->dnh_dnode; + + /* + * Wait for final references to the dnode to clear. This can + * only happen if the arc is asyncronously evicting state that + * has a hold on this dnode while we are trying to evict this + * dnode. 
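+ * The wait below simply polls the hold count with a short delay + * rather than blocking on a condition variable.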
+ */ + while (refcount_count(&dn->dn_holds) > 0) + delay(1); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; +} + +dnode_t * +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, + dnode_handle_t *dnh) +{ + dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); + dnh->dnh_dnode = dn; + zrl_init(&dnh->dnh_zrlock); + DNODE_VERIFY(dn); + return (dn); +} + +static void +dnode_buf_pageout(dmu_buf_t *db, void *arg) +{ + dnode_children_t *children_dnodes = arg; + int i; + int epb = db->db_size >> DNODE_SHIFT; + + ASSERT(epb == children_dnodes->dnc_count); + + for (i = 0; i < epb; i++) { + dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + dnode_t *dn; + + /* + * The dnode handle lock guards against the dnode moving to + * another valid address, so there is no need here to guard + * against changes to or from NULL. + */ + if (dnh->dnh_dnode == NULL) { + zrl_destroy(&dnh->dnh_zrlock); + continue; + } + + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; + /* + * If there are holds on this dnode, then there should + * be holds on the dnode's containing dbuf as well; thus + * it wouldn't be eligible for eviction and this function + * would not have been called. + */ + ASSERT(refcount_is_zero(&dn->dn_holds)); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; + } + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); +} + +/* + * errors: + * EINVAL - invalid object number. + * EIO - i/o error. + * succeeds even for free dnodes. + */ +int +dnode_hold_impl(objset_t *os, uint64_t object, int flag, + void *tag, dnode_t **dnp) +{ + int epb, idx, err; + int drop_struct_lock = FALSE; + int type; + uint64_t blk; + dnode_t *mdn, *dn; + dmu_buf_impl_t *db; + dnode_children_t *children_dnodes; + dnode_handle_t *dnh; + + /* + * If you are holding the spa config lock as writer, you shouldn't + * be asking the DMU to do *anything* unless it's the root pool + * which may require us to read from the root filesystem while + * holding some (not all) of the locks as writer. + */ + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || + (spa_is_root(os->os_spa) && + spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); + + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { + dn = (object == DMU_USERUSED_OBJECT) ? 
+ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); + if (dn == NULL) + return (ENOENT); + type = dn->dn_type; + if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) + return (ENOENT); + if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) + return (EEXIST); + DNODE_VERIFY(dn); + (void) refcount_add(&dn->dn_holds, tag); + *dnp = dn; + return (0); + } + + if (object == 0 || object >= DN_MAX_OBJECT) + return (EINVAL); + + mdn = DMU_META_DNODE(os); + ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); + + DNODE_VERIFY(mdn); + + if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); + + db = dbuf_hold(mdn, blk, FTAG); + if (drop_struct_lock) + rw_exit(&mdn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, FTAG); + return (err); + } + + ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); + epb = db->db.db_size >> DNODE_SHIFT; + + idx = object & (epb-1); + + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { + int i; + dnode_children_t *winner; + children_dnodes = kmem_alloc(sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP); + children_dnodes->dnc_count = epb; + dnh = &children_dnodes->dnc_children[0]; + for (i = 0; i < epb; i++) { + zrl_init(&dnh[i].dnh_zrlock); + dnh[i].dnh_dnode = NULL; + } + if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, + dnode_buf_pageout)) { + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); + children_dnodes = winner; + } + } + ASSERT(children_dnodes->dnc_count == epb); + + dnh = &children_dnodes->dnc_children[idx]; + zrl_add(&dnh->dnh_zrlock); + if ((dn = dnh->dnh_dnode) == NULL) { + dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; + dnode_t *winner; + + dn = dnode_create(os, phys, db, object, dnh); + winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); + if (winner != NULL) { + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + dn = winner; + } + } + + mutex_enter(&dn->dn_mtx); + type = dn->dn_type; + if (dn->dn_free_txg || + ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || + ((flag & DNODE_MUST_BE_FREE) && + (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { + mutex_exit(&dn->dn_mtx); + zrl_remove(&dnh->dnh_zrlock); + dbuf_rele(db, FTAG); + return (type == DMU_OT_NONE ? ENOENT : EEXIST); + } + mutex_exit(&dn->dn_mtx); + + if (refcount_add(&dn->dn_holds, tag) == 1) + dbuf_add_ref(db, dnh); + /* Now we can rely on the hold to prevent the dnode from moving. */ + zrl_remove(&dnh->dnh_zrlock); + + DNODE_VERIFY(dn); + ASSERT3P(dn->dn_dbuf, ==, db); + ASSERT3U(dn->dn_object, ==, object); + dbuf_rele(db, FTAG); + + *dnp = dn; + return (0); +} + +/* + * Return held dnode if the object is allocated, NULL if not. + */ +int +dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); +} + +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference. 
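+ * The zero-check and the increment are performed atomically under + * dn_mtx.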
+ */ +boolean_t +dnode_add_ref(dnode_t *dn, void *tag) +{ + mutex_enter(&dn->dn_mtx); + if (refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); +} + +void +dnode_rele(dnode_t *dn, void *tag) +{ + uint64_t refs; + /* Get while the hold prevents the dnode from moving. */ + dmu_buf_impl_t *db = dn->dn_dbuf; + dnode_handle_t *dnh = dn->dn_handle; + + mutex_enter(&dn->dn_mtx); + refs = refcount_remove(&dn->dn_holds, tag); + mutex_exit(&dn->dn_mtx); + + /* + * It's unsafe to release the last hold on a dnode by dnode_rele() or + * indirectly by dbuf_rele() while relying on the dnode handle to + * prevent the dnode from moving, since releasing the last hold could + * result in the dnode's parent dbuf evicting its dnode handles. For + * that reason anyone calling dnode_rele() or dbuf_rele() without some + * other direct or indirect hold on the dnode must first drop the dnode + * handle. + */ + ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ + if (refs == 0 && db != NULL) { + /* + * Another thread could add a hold to the dnode handle in + * dnode_hold_impl() while holding the parent dbuf. Since the + * hold on the parent dbuf prevents the handle from being + * destroyed, the hold on the handle is OK. We can't yet assert + * that the handle has zero references, but that will be + * asserted anyway when the handle gets destroyed. + */ + dbuf_rele(db, dnh); + } +} + +void +dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + dsl_dataset_dirty(os->os_dsl_dataset, tx); + return; + } + + DNODE_VERIFY(dn); + +#ifdef ZFS_DEBUG + mutex_enter(&dn->dn_mtx); + ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); + mutex_exit(&dn->dn_mtx); +#endif + + /* + * Determine old uid/gid when necessary + */ + dmu_objset_userquota_get_ids(dn, B_TRUE, tx); + + mutex_enter(&os->os_lock); + + /* + * If we are already marked dirty, we're done. + */ + if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { + mutex_exit(&os->os_lock); + return; + } + + ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(dn->dn_datablksz != 0); + ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); + ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); + ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0); + + dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", + dn->dn_object, txg); + + if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { + list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); + } else { + list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); + } + + mutex_exit(&os->os_lock); + + /* + * The dnode maintains a hold on its containing dbuf as + * long as there are holds on it. Each instantiated child + * dbuf maintains a hold on the dnode. When the last child + * drops its hold, the dnode will drop its hold on the + * containing dbuf. We add a "dirty hold" here so that the + * dnode will hang around after we finish processing its + * children. 
+ */ + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + + (void) dbuf_dirty(dn->dn_dbuf, tx); + + dsl_dataset_dirty(os->os_dsl_dataset, tx); +} + +void +dnode_free(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); + + /* we should be the only holder... hopefully */ + /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { + mutex_exit(&dn->dn_mtx); + return; + } + dn->dn_free_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + /* + * If the dnode is already dirty, it needs to be moved from + * the dirty list to the free list. + */ + mutex_enter(&dn->dn_objset->os_lock); + if (list_link_active(&dn->dn_dirty_link[txgoff])) { + list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); + list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); + mutex_exit(&dn->dn_objset->os_lock); + } else { + mutex_exit(&dn->dn_objset->os_lock); + dnode_setdirty(dn, tx); + } +} + +/* + * Try to change the block size for the indicated dnode. This can only + * succeed if there are no blocks allocated or dirty beyond first block + */ +int +dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + int err; + + if (size == 0) + size = SPA_MINBLOCKSIZE; + if (size > SPA_MAXBLOCKSIZE) + size = SPA_MAXBLOCKSIZE; + else + size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); + + if (ibs == dn->dn_indblkshift) + ibs = 0; + + if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + return (0); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* Check for any allocated blocks beyond the first */ + if (dn->dn_phys->dn_maxblkid != 0) + goto fail; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + + if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_exit(&dn->dn_dbufs_mtx); + goto fail; + } + } + mutex_exit(&dn->dn_dbufs_mtx); + + if (ibs && dn->dn_nlevels != 1) + goto fail; + + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + if (err == 0) + dbuf_new_size(db, size, tx); + else if (err != ENOENT) + goto fail; + + dnode_setdblksz(dn, size); + dnode_setdirty(dn, tx); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (ibs) { + dn->dn_indblkshift = ibs; + dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + } + /* rele after we have fixed the blocksize in the dnode */ + if (db) + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); + return (0); + +fail: + rw_exit(&dn->dn_struct_rwlock); + return (ENOTSUP); +} + +/* read-holding callers must not rely on the lock being continuously held */ +void +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int epbs, new_nlevels; + uint64_t sz; + + ASSERT(blkid != DMU_BONUS_BLKID); + + ASSERT(have_read ? + RW_READ_HELD(&dn->dn_struct_rwlock) : + RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * if we have a read-lock, check to see if we need to do any work + * before upgrading to a write-lock. 
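+ * If the try-upgrade fails, the lock is dropped and re-acquired as + * writer, so dn_maxblkid is re-checked once the write lock is held.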
+ */ + if (have_read) { + if (blkid <= dn->dn_maxblkid) + return; + + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } + } + + if (blkid <= dn->dn_maxblkid) + goto out; + + dn->dn_maxblkid = blkid; + + /* + * Compute the number of levels necessary to support the new maxblkid. + */ + new_nlevels = 1; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (sz = dn->dn_nblkptr; + sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) + new_nlevels++; + + if (new_nlevels > dn->dn_nlevels) { + int old_nlevels = dn->dn_nlevels; + dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; + + dn->dn_nlevels = new_nlevels; + + ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); + dn->dn_next_nlevels[txgoff] = new_nlevels; + + /* dirty the left indirects */ + db = dbuf_hold_level(dn, old_nlevels, 0, FTA |