path: root/uts
author     Martin Matuska <mm@FreeBSD.org>  2012-07-18 08:12:04 +0000
committer  Martin Matuska <mm@FreeBSD.org>  2012-07-18 08:12:04 +0000
commit     af56e8c4b416d774961b41eee1eb349d657ebb8c (patch)
tree       e332d1e6089905f45302dedddb9967a87ade136a /uts
parent     93a00b0821525e25814cd720fafd04d600811c28 (diff)
download   src-vendor/opensolaris.tar.gz
           src-vendor/opensolaris.zip
Update vendor-sys/opensolaris to last OpenSolaris state (13149:b23a4dab3d50)
Tags: vendor/opensolaris/20100818, vendor/opensolaris

Add ZFS bits to vendor-sys/opensolaris

Obtained from: https://hg.openindiana.org/upstream/oracle/onnv-gate
Notes:
    svn path=/vendor-sys/opensolaris/dist/; revision=238567
    svn path=/vendor-sys/opensolaris/20100818/; revision=238568; tag=vendor/opensolaris/20100818
Diffstat (limited to 'uts')
-rw-r--r--  uts/common/Makefile.files | 2007
-rw-r--r--  uts/common/dtrace/dtrace.c | 204
-rw-r--r--  uts/common/dtrace/fasttrap.c | 25
-rw-r--r--  uts/common/dtrace/lockstat.c | 10
-rw-r--r--  uts/common/dtrace/profile.c | 9
-rw-r--r--  uts/common/dtrace/sdt_subr.c | 311
-rw-r--r--  uts/common/dtrace/systrace.c | 11
-rw-r--r--  uts/common/fs/gfs.c | 1178
-rw-r--r--  uts/common/fs/vnode.c | 4536
-rw-r--r--  uts/common/fs/zfs/arc.c | 4658
-rw-r--r--  uts/common/fs/zfs/bplist.c | 69
-rw-r--r--  uts/common/fs/zfs/bpobj.c | 495
-rw-r--r--  uts/common/fs/zfs/dbuf.c | 2707
-rw-r--r--  uts/common/fs/zfs/ddt.c | 1146
-rw-r--r--  uts/common/fs/zfs/ddt_zap.c | 157
-rw-r--r--  uts/common/fs/zfs/dmu.c | 1764
-rw-r--r--  uts/common/fs/zfs/dmu_diff.c | 221
-rw-r--r--  uts/common/fs/zfs/dmu_object.c | 196
-rw-r--r--  uts/common/fs/zfs/dmu_objset.c | 1789
-rw-r--r--  uts/common/fs/zfs/dmu_send.c | 1606
-rw-r--r--  uts/common/fs/zfs/dmu_traverse.c | 482
-rw-r--r--  uts/common/fs/zfs/dmu_tx.c | 1382
-rw-r--r--  uts/common/fs/zfs/dmu_zfetch.c | 724
-rw-r--r--  uts/common/fs/zfs/dnode.c | 1993
-rw-r--r--  uts/common/fs/zfs/dnode_sync.c | 693
-rw-r--r--  uts/common/fs/zfs/dsl_dataset.c | 4030
-rw-r--r--  uts/common/fs/zfs/dsl_deadlist.c | 474
-rw-r--r--  uts/common/fs/zfs/dsl_deleg.c | 746
-rw-r--r--  uts/common/fs/zfs/dsl_dir.c | 1416
-rw-r--r--  uts/common/fs/zfs/dsl_pool.c | 848
-rw-r--r--  uts/common/fs/zfs/dsl_prop.c | 1153
-rw-r--r--  uts/common/fs/zfs/dsl_scan.c | 1766
-rw-r--r--  uts/common/fs/zfs/dsl_synctask.c | 240
-rw-r--r--  uts/common/fs/zfs/gzip.c | 69
-rw-r--r--  uts/common/fs/zfs/lzjb.c | 123
-rw-r--r--  uts/common/fs/zfs/metaslab.c | 1604
-rw-r--r--  uts/common/fs/zfs/refcount.c | 223
-rw-r--r--  uts/common/fs/zfs/rrwlock.c | 264
-rw-r--r--  uts/common/fs/zfs/sa.c | 1970
-rw-r--r--  uts/common/fs/zfs/sha256.c | 50
-rw-r--r--  uts/common/fs/zfs/spa.c | 5882
-rw-r--r--  uts/common/fs/zfs/spa_config.c | 487
-rw-r--r--  uts/common/fs/zfs/spa_errlog.c | 403
-rw-r--r--  uts/common/fs/zfs/spa_history.c | 502
-rw-r--r--  uts/common/fs/zfs/spa_misc.c | 1672
-rw-r--r--  uts/common/fs/zfs/space_map.c | 616
-rw-r--r--  uts/common/fs/zfs/sys/arc.h | 142
-rw-r--r--  uts/common/fs/zfs/sys/bplist.h | 57
-rw-r--r--  uts/common/fs/zfs/sys/bpobj.h | 91
-rw-r--r--  uts/common/fs/zfs/sys/dbuf.h | 375
-rw-r--r--  uts/common/fs/zfs/sys/ddt.h | 246
-rw-r--r--  uts/common/fs/zfs/sys/dmu.h | 740
-rw-r--r--  uts/common/fs/zfs/sys/dmu_impl.h | 272
-rw-r--r--  uts/common/fs/zfs/sys/dmu_objset.h | 183
-rw-r--r--  uts/common/fs/zfs/sys/dmu_traverse.h | 64
-rw-r--r--  uts/common/fs/zfs/sys/dmu_tx.h | 148
-rw-r--r--  uts/common/fs/zfs/sys/dmu_zfetch.h | 76
-rw-r--r--  uts/common/fs/zfs/sys/dnode.h | 329
-rw-r--r--  uts/common/fs/zfs/sys/dsl_dataset.h | 283
-rw-r--r--  uts/common/fs/zfs/sys/dsl_deadlist.h | 87
-rw-r--r--  uts/common/fs/zfs/sys/dsl_deleg.h | 78
-rw-r--r--  uts/common/fs/zfs/sys/dsl_dir.h | 167
-rw-r--r--  uts/common/fs/zfs/sys/dsl_pool.h | 151
-rw-r--r--  uts/common/fs/zfs/sys/dsl_prop.h | 119
-rw-r--r--  uts/common/fs/zfs/sys/dsl_scan.h | 108
-rw-r--r--  uts/common/fs/zfs/sys/dsl_synctask.h | 79
-rw-r--r--  uts/common/fs/zfs/sys/metaslab.h | 80
-rw-r--r--  uts/common/fs/zfs/sys/metaslab_impl.h | 89
-rw-r--r--  uts/common/fs/zfs/sys/refcount.h | 107
-rw-r--r--  uts/common/fs/zfs/sys/rrwlock.h | 80
-rw-r--r--  uts/common/fs/zfs/sys/sa.h | 170
-rw-r--r--  uts/common/fs/zfs/sys/sa_impl.h | 287
-rw-r--r--  uts/common/fs/zfs/sys/spa.h | 706
-rw-r--r--  uts/common/fs/zfs/sys/spa_boot.h | 42
-rw-r--r--  uts/common/fs/zfs/sys/spa_impl.h | 235
-rw-r--r--  uts/common/fs/zfs/sys/space_map.h | 179
-rw-r--r--  uts/common/fs/zfs/sys/txg.h | 131
-rw-r--r--  uts/common/fs/zfs/sys/txg_impl.h | 75
-rw-r--r--  uts/common/fs/zfs/sys/uberblock.h | 46
-rw-r--r--  uts/common/fs/zfs/sys/uberblock_impl.h | 63
-rw-r--r--  uts/common/fs/zfs/sys/unique.h | 59
-rw-r--r--  uts/common/fs/zfs/sys/vdev.h | 161
-rw-r--r--  uts/common/fs/zfs/sys/vdev_disk.h | 56
-rw-r--r--  uts/common/fs/zfs/sys/vdev_file.h | 46
-rw-r--r--  uts/common/fs/zfs/sys/vdev_impl.h | 322
-rw-r--r--  uts/common/fs/zfs/sys/zap.h | 482
-rw-r--r--  uts/common/fs/zfs/sys/zap_impl.h | 228
-rw-r--r--  uts/common/fs/zfs/sys/zap_leaf.h | 245
-rw-r--r--  uts/common/fs/zfs/sys/zfs_acl.h | 245
-rw-r--r--  uts/common/fs/zfs/sys/zfs_context.h | 73
-rw-r--r--  uts/common/fs/zfs/sys/zfs_ctldir.h | 73
-rw-r--r--  uts/common/fs/zfs/sys/zfs_debug.h | 82
-rw-r--r--  uts/common/fs/zfs/sys/zfs_dir.h | 74
-rw-r--r--  uts/common/fs/zfs/sys/zfs_fuid.h | 131
-rw-r--r--  uts/common/fs/zfs/sys/zfs_ioctl.h | 349
-rw-r--r--  uts/common/fs/zfs/sys/zfs_onexit.h | 66
-rw-r--r--  uts/common/fs/zfs/sys/zfs_rlock.h | 89
-rw-r--r--  uts/common/fs/zfs/sys/zfs_sa.h | 143
-rw-r--r--  uts/common/fs/zfs/sys/zfs_stat.h | 56
-rw-r--r--  uts/common/fs/zfs/sys/zfs_vfsops.h | 159
-rw-r--r--  uts/common/fs/zfs/sys/zfs_znode.h | 361
-rw-r--r--  uts/common/fs/zfs/sys/zil.h | 428
-rw-r--r--  uts/common/fs/zfs/sys/zil_impl.h | 147
-rw-r--r--  uts/common/fs/zfs/sys/zio.h | 559
-rw-r--r--  uts/common/fs/zfs/sys/zio_checksum.h | 75
-rw-r--r--  uts/common/fs/zfs/sys/zio_compress.h | 84
-rw-r--r--  uts/common/fs/zfs/sys/zio_impl.h | 175
-rw-r--r--  uts/common/fs/zfs/sys/zrlock.h | 66
-rw-r--r--  uts/common/fs/zfs/sys/zvol.h | 76
-rw-r--r--  uts/common/fs/zfs/txg.c | 724
-rw-r--r--  uts/common/fs/zfs/uberblock.c | 61
-rw-r--r--  uts/common/fs/zfs/unique.c | 116
-rw-r--r--  uts/common/fs/zfs/vdev.c | 3130
-rw-r--r--  uts/common/fs/zfs/vdev_cache.c | 416
-rw-r--r--  uts/common/fs/zfs/vdev_disk.c | 610
-rw-r--r--  uts/common/fs/zfs/vdev_file.c | 217
-rw-r--r--  uts/common/fs/zfs/vdev_label.c | 1216
-rw-r--r--  uts/common/fs/zfs/vdev_mirror.c | 485
-rw-r--r--  uts/common/fs/zfs/vdev_missing.c | 100
-rw-r--r--  uts/common/fs/zfs/vdev_queue.c | 406
-rw-r--r--  uts/common/fs/zfs/vdev_raidz.c | 2146
-rw-r--r--  uts/common/fs/zfs/vdev_root.c | 116
-rw-r--r--  uts/common/fs/zfs/zap.c | 1354
-rw-r--r--  uts/common/fs/zfs/zap_leaf.c | 872
-rw-r--r--  uts/common/fs/zfs/zap_micro.c | 1455
-rw-r--r--  uts/common/fs/zfs/zfs.conf | 28
-rw-r--r--  uts/common/fs/zfs/zfs_acl.c | 2748
-rw-r--r--  uts/common/fs/zfs/zfs_byteswap.c | 199
-rw-r--r--  uts/common/fs/zfs/zfs_ctldir.c | 1349
-rw-r--r--  uts/common/fs/zfs/zfs_debug.c | 95
-rw-r--r--  uts/common/fs/zfs/zfs_dir.c | 1089
-rw-r--r--  uts/common/fs/zfs/zfs_fm.c | 863
-rw-r--r--  uts/common/fs/zfs/zfs_fuid.c | 756
-rw-r--r--  uts/common/fs/zfs/zfs_ioctl.c | 5122
-rw-r--r--  uts/common/fs/zfs/zfs_log.c | 676
-rw-r--r--  uts/common/fs/zfs/zfs_onexit.c | 246
-rw-r--r--  uts/common/fs/zfs/zfs_replay.c | 931
-rw-r--r--  uts/common/fs/zfs/zfs_rlock.c | 602
-rw-r--r--  uts/common/fs/zfs/zfs_sa.c | 334
-rw-r--r--  uts/common/fs/zfs/zfs_vfsops.c | 2303
-rw-r--r--  uts/common/fs/zfs/zfs_vnops.c | 5243
-rw-r--r--  uts/common/fs/zfs/zfs_znode.c | 2121
-rw-r--r--  uts/common/fs/zfs/zil.c | 1992
-rw-r--r--  uts/common/fs/zfs/zio.c | 2952
-rw-r--r--  uts/common/fs/zfs/zio_checksum.c | 274
-rw-r--r--  uts/common/fs/zfs/zio_compress.c | 132
-rw-r--r--  uts/common/fs/zfs/zio_inject.c | 515
-rw-r--r--  uts/common/fs/zfs/zle.c | 86
-rw-r--r--  uts/common/fs/zfs/zrlock.c | 194
-rw-r--r--  uts/common/fs/zfs/zvol.c | 1894
-rw-r--r--  uts/common/os/callb.c | 410
-rw-r--r--  uts/common/os/fm.c | 1386
-rw-r--r--  uts/common/os/nvpair_alloc_system.c | 64
-rw-r--r--  uts/common/sys/acl.h | 302
-rw-r--r--  uts/common/sys/acl_impl.h | 61
-rw-r--r--  uts/common/sys/avl.h | 309
-rw-r--r--  uts/common/sys/avl_impl.h | 164
-rw-r--r--  uts/common/sys/bitmap.h | 194
-rw-r--r--  uts/common/sys/callb.h | 213
-rw-r--r--  uts/common/sys/ccompile.h | 127
-rw-r--r--  uts/common/sys/compress.h | 46
-rw-r--r--  uts/common/sys/cpupart.h | 27
-rw-r--r--  uts/common/sys/cpuvar.h | 116
-rw-r--r--  uts/common/sys/cred.h | 193
-rw-r--r--  uts/common/sys/debug.h | 146
-rw-r--r--  uts/common/sys/dtrace.h | 25
-rw-r--r--  uts/common/sys/errorq.h | 83
-rw-r--r--  uts/common/sys/extdirent.h | 77
-rw-r--r--  uts/common/sys/feature_tests.h | 396
-rw-r--r--  uts/common/sys/fm/fs/zfs.h | 96
-rw-r--r--  uts/common/sys/fm/protocol.h | 371
-rw-r--r--  uts/common/sys/fm/util.h | 103
-rw-r--r--  uts/common/sys/fs/zfs.h | 912
-rw-r--r--  uts/common/sys/fs/zut.h | 93
-rw-r--r--  uts/common/sys/gfs.h | 173
-rw-r--r--  uts/common/sys/idmap.h | 97
-rw-r--r--  uts/common/sys/isa_defs.h | 487
-rw-r--r--  uts/common/sys/list.h | 67
-rw-r--r--  uts/common/sys/list_impl.h | 53
-rw-r--r--  uts/common/sys/note.h | 56
-rw-r--r--  uts/common/sys/nvpair.h | 281
-rw-r--r--  uts/common/sys/nvpair_impl.h | 73
-rw-r--r--  uts/common/sys/processor.h | 149
-rw-r--r--  uts/common/sys/procset.h | 160
-rw-r--r--  uts/common/sys/synch.h | 162
-rw-r--r--  uts/common/sys/sysevent.h | 283
-rw-r--r--  uts/common/sys/sysevent/dev.h | 256
-rw-r--r--  uts/common/sys/sysevent/eventdefs.h | 275
-rw-r--r--  uts/common/sys/sysmacros.h | 378
-rw-r--r--  uts/common/sys/taskq.h | 92
-rw-r--r--  uts/common/sys/u8_textprep.h | 113
-rw-r--r--  uts/common/sys/u8_textprep_data.h | 35376
-rw-r--r--  uts/common/sys/vnode.h | 1431
-rw-r--r--  uts/common/sys/zmod.h | 68
-rw-r--r--  uts/common/zmod/adler32.c | 149
-rw-r--r--  uts/common/zmod/crc32.c | 428
-rw-r--r--  uts/common/zmod/crc32.h | 443
-rw-r--r--  uts/common/zmod/deflate.c | 1742
-rw-r--r--  uts/common/zmod/deflate.h | 331
-rw-r--r--  uts/common/zmod/inffast.c | 320
-rw-r--r--  uts/common/zmod/inffast.h | 13
-rw-r--r--  uts/common/zmod/inffixed.h | 96
-rw-r--r--  uts/common/zmod/inflate.c | 1395
-rw-r--r--  uts/common/zmod/inflate.h | 117
-rw-r--r--  uts/common/zmod/inftrees.c | 331
-rw-r--r--  uts/common/zmod/inftrees.h | 57
-rw-r--r--  uts/common/zmod/trees.c | 1219
-rw-r--r--  uts/common/zmod/zconf.h | 117
-rw-r--r--  uts/common/zmod/zlib.h | 1359
-rw-r--r--  uts/common/zmod/zmod.c | 113
-rw-r--r--  uts/common/zmod/zmod_subr.c | 85
-rw-r--r--  uts/common/zmod/zutil.c | 324
-rw-r--r--  uts/common/zmod/zutil.h | 274
213 files changed, 165021 insertions, 110 deletions
diff --git a/uts/common/Makefile.files b/uts/common/Makefile.files
new file mode 100644
index 000000000000..ec08410b4ff3
--- /dev/null
+++ b/uts/common/Makefile.files
@@ -0,0 +1,2007 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+#
+# This Makefile defines all file modules for the directory uts/common
+# and its children. These are the source files which may be considered
+# common to all SunOS systems.
+
+i386_CORE_OBJS += \
+ atomic.o \
+ avintr.o \
+ pic.o
+
+sparc_CORE_OBJS +=
+
+COMMON_CORE_OBJS += \
+ beep.o \
+ bitset.o \
+ bp_map.o \
+ brand.o \
+ cpucaps.o \
+ cmt.o \
+ cmt_policy.o \
+ cpu.o \
+ cpu_event.o \
+ cpu_intr.o \
+ cpu_pm.o \
+ cpupart.o \
+ cap_util.o \
+ disp.o \
+ group.o \
+ kstat_fr.o \
+ iscsiboot_prop.o \
+ lgrp.o \
+ lgrp_topo.o \
+ mmapobj.o \
+ mutex.o \
+ page_lock.o \
+ page_retire.o \
+ panic.o \
+ param.o \
+ pg.o \
+ pghw.o \
+ putnext.o \
+ rctl_proc.o \
+ rwlock.o \
+ seg_kmem.o \
+ softint.o \
+ string.o \
+ strtol.o \
+ strtoul.o \
+ strtoll.o \
+ strtoull.o \
+ thread_intr.o \
+ vm_page.o \
+ vm_pagelist.o \
+ zlib_obj.o \
+ clock_tick.o
+
+CORE_OBJS += $(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS)
+
+ZLIB_OBJS = zutil.o zmod.o zmod_subr.o \
+ adler32.o crc32.o deflate.o inffast.o \
+ inflate.o inftrees.o trees.o
+
+GENUNIX_OBJS += \
+ access.o \
+ acl.o \
+ acl_common.o \
+ adjtime.o \
+ alarm.o \
+ aio_subr.o \
+ auditsys.o \
+ audit_core.o \
+ audit_zone.o \
+ audit_memory.o \
+ autoconf.o \
+ avl.o \
+ bdev_dsort.o \
+ bio.o \
+ bitmap.o \
+ blabel.o \
+ brandsys.o \
+ bz2blocksort.o \
+ bz2compress.o \
+ bz2decompress.o \
+ bz2randtable.o \
+ bz2bzlib.o \
+ bz2crctable.o \
+ bz2huffman.o \
+ callb.o \
+ callout.o \
+ chdir.o \
+ chmod.o \
+ chown.o \
+ cladm.o \
+ class.o \
+ clock.o \
+ clock_highres.o \
+ clock_realtime.o\
+ close.o \
+ compress.o \
+ condvar.o \
+ conf.o \
+ console.o \
+ contract.o \
+ copyops.o \
+ core.o \
+ corectl.o \
+ cred.o \
+ cs_stubs.o \
+ dacf.o \
+ dacf_clnt.o \
+ damap.o \
+ cyclic.o \
+ ddi.o \
+ ddifm.o \
+ ddi_hp_impl.o \
+ ddi_hp_ndi.o \
+ ddi_intr.o \
+ ddi_intr_impl.o \
+ ddi_intr_irm.o \
+ ddi_nodeid.o \
+ ddi_timer.o \
+ devcfg.o \
+ devcache.o \
+ device.o \
+ devid.o \
+ devid_cache.o \
+ devid_scsi.o \
+ devid_smp.o \
+ devpolicy.o \
+ disp_lock.o \
+ dnlc.o \
+ driver.o \
+ dumpsubr.o \
+ driver_lyr.o \
+ dtrace_subr.o \
+ errorq.o \
+ etheraddr.o \
+ evchannels.o \
+ exacct.o \
+ exacct_core.o \
+ exec.o \
+ exit.o \
+ fbio.o \
+ fcntl.o \
+ fdbuffer.o \
+ fdsync.o \
+ fem.o \
+ ffs.o \
+ fio.o \
+ flock.o \
+ fm.o \
+ fork.o \
+ vpm.o \
+ fs_reparse.o \
+ fs_subr.o \
+ fsflush.o \
+ ftrace.o \
+ getcwd.o \
+ getdents.o \
+ getloadavg.o \
+ getpagesizes.o \
+ getpid.o \
+ gfs.o \
+ rusagesys.o \
+ gid.o \
+ groups.o \
+ grow.o \
+ hat.o \
+ hat_refmod.o \
+ id32.o \
+ id_space.o \
+ inet_ntop.o \
+ instance.o \
+ ioctl.o \
+ ip_cksum.o \
+ issetugid.o \
+ ippconf.o \
+ kcpc.o \
+ kdi.o \
+ kiconv.o \
+ klpd.o \
+ kmem.o \
+ ksyms_snapshot.o \
+ l_strplumb.o \
+ labelsys.o \
+ link.o \
+ list.o \
+ lockstat_subr.o \
+ log_sysevent.o \
+ logsubr.o \
+ lookup.o \
+ lseek.o \
+ ltos.o \
+ lwp.o \
+ lwp_create.o \
+ lwp_info.o \
+ lwp_self.o \
+ lwp_sobj.o \
+ lwp_timer.o \
+ lwpsys.o \
+ main.o \
+ mmapobjsys.o \
+ memcntl.o \
+ memstr.o \
+ lgrpsys.o \
+ mkdir.o \
+ mknod.o \
+ mount.o \
+ move.o \
+ msacct.o \
+ multidata.o \
+ nbmlock.o \
+ ndifm.o \
+ nice.o \
+ netstack.o \
+ ntptime.o \
+ nvpair.o \
+ nvpair_alloc_system.o \
+ nvpair_alloc_fixed.o \
+ octet.o \
+ open.o \
+ p_online.o \
+ pathconf.o \
+ pathname.o \
+ pause.o \
+ serializer.o \
+ pci_intr_lib.o \
+ pci_cap.o \
+ pcifm.o \
+ pgrp.o \
+ pgrpsys.o \
+ pid.o \
+ pkp_hash.o \
+ policy.o \
+ poll.o \
+ pool.o \
+ pool_pset.o \
+ port_subr.o \
+ ppriv.o \
+ printf.o \
+ priocntl.o \
+ priv.o \
+ priv_const.o \
+ proc.o \
+ procset.o \
+ processor_bind.o \
+ processor_info.o \
+ profil.o \
+ project.o \
+ qsort.o \
+ rctl.o \
+ rctlsys.o \
+ readlink.o \
+ refstr.o \
+ rename.o \
+ resolvepath.o \
+ retire_store.o \
+ process.o \
+ rlimit.o \
+ rmap.o \
+ rw.o \
+ rwstlock.o \
+ sad_conf.o \
+ sid.o \
+ sidsys.o \
+ sched.o \
+ schedctl.o \
+ sctp_crc32.o \
+ seg_dev.o \
+ seg_kp.o \
+ seg_kpm.o \
+ seg_map.o \
+ seg_vn.o \
+ seg_spt.o \
+ semaphore.o \
+ sendfile.o \
+ session.o \
+ share.o \
+ shuttle.o \
+ sig.o \
+ sigaction.o \
+ sigaltstack.o \
+ signotify.o \
+ sigpending.o \
+ sigprocmask.o \
+ sigqueue.o \
+ sigsendset.o \
+ sigsuspend.o \
+ sigtimedwait.o \
+ sleepq.o \
+ sock_conf.o \
+ space.o \
+ sscanf.o \
+ stat.o \
+ statfs.o \
+ statvfs.o \
+ stol.o \
+ str_conf.o \
+ strcalls.o \
+ stream.o \
+ streamio.o \
+ strext.o \
+ strsubr.o \
+ strsun.o \
+ subr.o \
+ sunddi.o \
+ sunmdi.o \
+ sunndi.o \
+ sunpci.o \
+ sunpm.o \
+ sundlpi.o \
+ suntpi.o \
+ swap_subr.o \
+ swap_vnops.o \
+ symlink.o \
+ sync.o \
+ sysclass.o \
+ sysconfig.o \
+ sysent.o \
+ sysfs.o \
+ systeminfo.o \
+ task.o \
+ taskq.o \
+ tasksys.o \
+ time.o \
+ timer.o \
+ times.o \
+ timers.o \
+ thread.o \
+ tlabel.o \
+ tnf_res.o \
+ turnstile.o \
+ tty_common.o \
+ u8_textprep.o \
+ uadmin.o \
+ uconv.o \
+ ucredsys.o \
+ uid.o \
+ umask.o \
+ umount.o \
+ uname.o \
+ unix_bb.o \
+ unlink.o \
+ urw.o \
+ utime.o \
+ utssys.o \
+ uucopy.o \
+ vfs.o \
+ vfs_conf.o \
+ vmem.o \
+ vm_anon.o \
+ vm_as.o \
+ vm_meter.o \
+ vm_pageout.o \
+ vm_pvn.o \
+ vm_rm.o \
+ vm_seg.o \
+ vm_subr.o \
+ vm_swap.o \
+ vm_usage.o \
+ vnode.o \
+ vuid_queue.o \
+ vuid_store.o \
+ waitq.o \
+ watchpoint.o \
+ yield.o \
+ scsi_confdata.o \
+ xattr.o \
+ xattr_common.o \
+ xdr_mblk.o \
+ xdr_mem.o \
+ xdr.o \
+ xdr_array.o \
+ xdr_refer.o \
+ xhat.o \
+ zone.o
+
+#
+# Stubs for the stand-alone linker/loader
+#
+sparc_GENSTUBS_OBJS = \
+ kobj_stubs.o
+
+i386_GENSTUBS_OBJS =
+
+COMMON_GENSTUBS_OBJS =
+
+GENSTUBS_OBJS += $(COMMON_GENSTUBS_OBJS) $($(MACH)_GENSTUBS_OBJS)
+
+#
+# DTrace and DTrace Providers
+#
+DTRACE_OBJS += dtrace.o dtrace_isa.o dtrace_asm.o
+
+SDT_OBJS += sdt_subr.o
+
+PROFILE_OBJS += profile.o
+
+SYSTRACE_OBJS += systrace.o
+
+LOCKSTAT_OBJS += lockstat.o
+
+FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o
+
+DCPC_OBJS += dcpc.o
+
+#
+# Driver (pseudo-driver) Modules
+#
+IPP_OBJS += ippctl.o
+
+AUDIO_OBJS += audio_client.o audio_ddi.o audio_engine.o \
+ audio_fltdata.o audio_format.o audio_ctrl.o \
+ audio_grc3.o audio_output.o audio_input.o \
+ audio_oss.o audio_sun.o
+
+AUDIOEMU10K_OBJS += audioemu10k.o
+
+AUDIOENS_OBJS += audioens.o
+
+AUDIOVIA823X_OBJS += audiovia823x.o
+
+AUDIOVIA97_OBJS += audiovia97.o
+
+AUDIO1575_OBJS += audio1575.o
+
+AUDIO810_OBJS += audio810.o
+
+AUDIOCMI_OBJS += audiocmi.o
+
+AUDIOHD_OBJS += audiohd.o
+
+AUDIOIXP_OBJS += audioixp.o
+
+AUDIOLS_OBJS += audiols.o
+
+AUDIOP16X_OBJS += audiop16x.o
+
+AUDIOPCI_OBJS += audiopci.o
+
+AUDIOSOLO_OBJS += audiosolo.o
+
+AUDIOTS_OBJS += audiots.o
+
+AC97_OBJS += ac97.o ac97_ad.o ac97_alc.o ac97_cmi.o
+
+BLKDEV_OBJS += blkdev.o
+
+CARDBUS_OBJS += cardbus.o cardbus_hp.o cardbus_cfg.o
+
+CONSKBD_OBJS += conskbd.o
+
+CONSMS_OBJS += consms.o
+
+OLDPTY_OBJS += tty_ptyconf.o
+
+PTC_OBJS += tty_pty.o
+
+PTSL_OBJS += tty_pts.o
+
+PTM_OBJS += ptm.o
+
+MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \
+ mii_marvell.o mii_realtek.o mii_other.o
+
+PTS_OBJS += pts.o
+
+PTY_OBJS += ptms_conf.o
+
+SAD_OBJS += sad.o
+
+MD4_OBJS += md4.o md4_mod.o
+
+MD5_OBJS += md5.o md5_mod.o
+
+SHA1_OBJS += sha1.o sha1_mod.o fips_sha1_util.o
+
+SHA2_OBJS += sha2.o sha2_mod.o fips_sha2_util.o
+
+IPGPC_OBJS += classifierddi.o classifier.o filters.o trie.o table.o \
+ ba_table.o
+
+DSCPMK_OBJS += dscpmk.o dscpmkddi.o
+
+DLCOSMK_OBJS += dlcosmk.o dlcosmkddi.o
+
+FLOWACCT_OBJS += flowacctddi.o flowacct.o
+
+TOKENMT_OBJS += tokenmt.o tokenmtddi.o
+
+TSWTCL_OBJS += tswtcl.o tswtclddi.o
+
+ARP_OBJS += arpddi.o
+
+ICMP_OBJS += icmpddi.o
+
+ICMP6_OBJS += icmp6ddi.o
+
+RTS_OBJS += rtsddi.o
+
+IP_ICMP_OBJS = icmp.o icmp_opt_data.o
+IP_RTS_OBJS = rts.o rts_opt_data.o
+IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_opt_data.o tcp_sack.o tcp_stats.o \
+ tcp_misc.o tcp_timers.o tcp_time_wait.o tcp_tpi.o tcp_output.o \
+ tcp_input.o tcp_socket.o tcp_bind.o tcp_cluster.o tcp_tunables.o
+IP_UDP_OBJS = udp.o udp_opt_data.o udp_tunables.o udp_stats.o
+IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \
+ sctp_init.o sctp_input.o sctp_cookie.o \
+ sctp_conn.o sctp_error.o sctp_snmp.o \
+ sctp_tunables.o sctp_shutdown.o sctp_common.o \
+ sctp_timer.o sctp_heartbeat.o sctp_hash.o \
+ sctp_bind.o sctp_notify.o sctp_asconf.o \
+ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o \
+ sctp_misc.o
+IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o
+
+IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \
+ ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
+ ip_multi.o ip2mac.o ip_ndp.o ip_rts.o ip_srcid.o \
+ ipddi.o ipdrop.o mi.o nd.o tunables.o optcom.o snmpcom.o \
+ ipsec_loader.o spd.o ipclassifier.o inet_common.o ip_squeue.o \
+ squeue.o ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \
+ ip_helper_stream.o ip_tunables.o \
+ ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \
+ conn_opt.o ip_attr.o ip_dce.o \
+ $(IP_ICMP_OBJS) \
+ $(IP_RTS_OBJS) \
+ $(IP_TCP_OBJS) \
+ $(IP_UDP_OBJS) \
+ $(IP_SCTP_OBJS) \
+ $(IP_ILB_OBJS)
+
+IP6_OBJS += ip6ddi.o
+
+HOOK_OBJS += hook.o
+
+NETI_OBJS += neti_impl.o neti_mod.o neti_stack.o
+
+KEYSOCK_OBJS += keysockddi.o keysock.o keysock_opt_data.o
+
+IPNET_OBJS += ipnet.o ipnet_bpf.o
+
+SPDSOCK_OBJS += spdsockddi.o spdsock.o spdsock_opt_data.o
+
+IPSECESP_OBJS += ipsecespddi.o ipsecesp.o
+
+IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o
+
+SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o
+
+SPPPTUN_OBJS += sppptun.o sppptun_mod.o
+
+SPPPASYN_OBJS += spppasyn.o spppasyn_mod.o
+
+SPPPCOMP_OBJS += spppcomp.o spppcomp_mod.o deflate.o bsd-comp.o vjcompress.o \
+ zlib.o
+
+TCP_OBJS += tcpddi.o
+
+TCP6_OBJS += tcp6ddi.o
+
+NCA_OBJS += ncaddi.o
+
+SDP_SOCK_MOD_OBJS += sockmod_sdp.o socksdp.o socksdpsubr.o
+
+SCTP_SOCK_MOD_OBJS += sockmod_sctp.o socksctp.o socksctpsubr.o
+
+PFP_SOCK_MOD_OBJS += sockmod_pfp.o
+
+RDS_SOCK_MOD_OBJS += sockmod_rds.o
+
+RDS_OBJS += rdsddi.o rdssubr.o rds_opt.o rds_ioctl.o
+
+RDSIB_OBJS += rdsib.o rdsib_ib.o rdsib_cm.o rdsib_ep.o rdsib_buf.o \
+ rdsib_debug.o rdsib_sc.o
+
+RDSV3_OBJS += af_rds.o rdsv3_ddi.o bind.o loop.o threads.o connection.o \
+ transport.o cong.o sysctl.o message.o rds_recv.o send.o \
+ stats.o info.o page.o rdma_transport.o ib_ring.o ib_rdma.o \
+ ib_recv.o ib.o ib_send.o ib_sysctl.o ib_stats.o ib_cm.o \
+ rdsv3_sc.o rdsv3_debug.o rdsv3_impl.o rdma.o rdsv3_af_thr.o
+
+ISER_OBJS += iser.o iser_cm.o iser_cq.o iser_ib.o iser_idm.o \
+ iser_resource.o iser_xfer.o
+
+UDP_OBJS += udpddi.o
+
+UDP6_OBJS += udp6ddi.o
+
+SY_OBJS += gentty.o
+
+TCO_OBJS += ticots.o
+
+TCOO_OBJS += ticotsord.o
+
+TCL_OBJS += ticlts.o
+
+TL_OBJS += tl.o
+
+DUMP_OBJS += dump.o
+
+BPF_OBJS += bpf.o bpf_filter.o bpf_mod.o bpf_dlt.o bpf_mac.o
+
+CLONE_OBJS += clone.o
+
+CN_OBJS += cons.o
+
+DLD_OBJS += dld_drv.o dld_proto.o dld_str.o dld_flow.o
+
+DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_mgmt.o
+
+GLD_OBJS += gld.o gldutil.o
+
+MAC_OBJS += mac.o mac_bcast.o mac_client.o mac_datapath_setup.o mac_flow.o \
+ mac_hio.o mac_mod.o mac_ndd.o mac_provider.o mac_sched.o \
+ mac_protect.o mac_soft_ring.o mac_stat.o mac_util.o
+
+MAC_6TO4_OBJS += mac_6to4.o
+
+MAC_ETHER_OBJS += mac_ether.o
+
+MAC_IPV4_OBJS += mac_ipv4.o
+
+MAC_IPV6_OBJS += mac_ipv6.o
+
+MAC_WIFI_OBJS += mac_wifi.o
+
+MAC_IB_OBJS += mac_ib.o
+
+IPTUN_OBJS += iptun_dev.o iptun_ctl.o iptun.o
+
+AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \
+ aggr_send.o aggr_recv.o aggr_lacp.o
+
+SOFTMAC_OBJS += softmac_main.o softmac_ctl.o softmac_capab.o \
+ softmac_dev.o softmac_stat.o softmac_pkt.o softmac_fp.o
+
+NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \
+ net80211_output.o net80211_node.o net80211_crypto.o \
+ net80211_crypto_none.o net80211_crypto_wep.o net80211_ioctl.o \
+ net80211_crypto_tkip.o net80211_crypto_ccmp.o \
+ net80211_ht.o
+
+VNIC_OBJS += vnic_ctl.o vnic_dev.o
+
+SIMNET_OBJS += simnet.o
+
+IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o
+
+IBCM_OBJS += ibcm_impl.o ibcm_sm.o ibcm_ti.o ibcm_utils.o ibcm_path.o \
+ ibcm_arp.o ibcm_arp_link.o
+
+IBDM_OBJS += ibdm.o
+
+IBDMA_OBJS += ibdma.o
+
+IBMF_OBJS += ibmf.o ibmf_impl.o ibmf_dr.o ibmf_wqe.o ibmf_ud_dest.o ibmf_mod.o \
+ ibmf_send.o ibmf_recv.o ibmf_handlers.o ibmf_trans.o \
+ ibmf_timers.o ibmf_msg.o ibmf_utils.o ibmf_rmpp.o \
+ ibmf_saa.o ibmf_saa_impl.o ibmf_saa_utils.o ibmf_saa_events.o
+
+IBTL_OBJS += ibtl_impl.o ibtl_util.o ibtl_mem.o ibtl_handlers.o ibtl_qp.o \
+ ibtl_cq.o ibtl_wr.o ibtl_hca.o ibtl_chan.o ibtl_cm.o \
+ ibtl_mcg.o ibtl_ibnex.o ibtl_srq.o ibtl_part.o
+
+TAVOR_OBJS += tavor.o tavor_agents.o tavor_cfg.o tavor_ci.o tavor_cmd.o \
+ tavor_cq.o tavor_event.o tavor_ioctl.o tavor_misc.o \
+ tavor_mr.o tavor_qp.o tavor_qpmod.o tavor_rsrc.o \
+ tavor_srq.o tavor_stats.o tavor_umap.o tavor_wr.o
+
+HERMON_OBJS += hermon.o hermon_agents.o hermon_cfg.o hermon_ci.o hermon_cmd.o \
+ hermon_cq.o hermon_event.o hermon_ioctl.o hermon_misc.o \
+ hermon_mr.o hermon_qp.o hermon_qpmod.o hermon_rsrc.o \
+ hermon_srq.o hermon_stats.o hermon_umap.o hermon_wr.o \
+ hermon_fcoib.o hermon_fm.o
+
+DAPLT_OBJS += daplt.o
+
+SOL_OFS_OBJS += sol_cma.o sol_ib_cma.o sol_uobj.o \
+ sol_ofs_debug_util.o sol_ofs_gen_util.o \
+ sol_kverbs.o
+
+SOL_UCMA_OBJS += sol_ucma.o
+
+SOL_UVERBS_OBJS += sol_uverbs.o sol_uverbs_comp.o sol_uverbs_event.o \
+ sol_uverbs_hca.o sol_uverbs_qp.o
+
+SOL_UMAD_OBJS += sol_umad.o
+
+KSTAT_OBJS += kstat.o
+
+KSYMS_OBJS += ksyms.o
+
+INSTANCE_OBJS += inst_sync.o
+
+IWSCN_OBJS += iwscons.o
+
+LOFI_OBJS += lofi.o LzmaDec.o
+
+FSSNAP_OBJS += fssnap.o
+
+FSSNAPIF_OBJS += fssnap_if.o
+
+MM_OBJS += mem.o
+
+PHYSMEM_OBJS += physmem.o
+
+OPTIONS_OBJS += options.o
+
+WINLOCK_OBJS += winlockio.o
+
+PM_OBJS += pm.o
+SRN_OBJS += srn.o
+
+PSEUDO_OBJS += pseudonex.o
+
+RAMDISK_OBJS += ramdisk.o
+
+LLC1_OBJS += llc1.o
+
+USBKBM_OBJS += usbkbm.o
+
+USBWCM_OBJS += usbwcm.o
+
+BOFI_OBJS += bofi.o
+
+HID_OBJS += hid.o
+
+HWA_RC_OBJS += hwarc.o
+
+USBSKEL_OBJS += usbskel.o
+
+USBVC_OBJS += usbvc.o usbvc_v4l2.o
+
+HIDPARSER_OBJS += hidparser.o
+
+USB_AC_OBJS += usb_ac.o
+
+USB_AS_OBJS += usb_as.o
+
+USB_AH_OBJS += usb_ah.o
+
+USBMS_OBJS += usbms.o
+
+USBPRN_OBJS += usbprn.o
+
+UGEN_OBJS += ugen.o
+
+USBSER_OBJS += usbser.o usbser_rseq.o
+
+USBSACM_OBJS += usbsacm.o
+
+USBSER_KEYSPAN_OBJS += usbser_keyspan.o keyspan_dsd.o keyspan_pipe.o
+
+USBS49_FW_OBJS += keyspan_49fw.o
+
+USBSPRL_OBJS += usbser_pl2303.o pl2303_dsd.o
+
+WUSB_CA_OBJS += wusb_ca.o
+
+USBFTDI_OBJS += usbser_uftdi.o uftdi_dsd.o
+
+USBECM_OBJS += usbecm.o
+
+WC_OBJS += wscons.o vcons.o
+
+VCONS_CONF_OBJS += vcons_conf.o
+
+SCSI_OBJS += scsi_capabilities.o scsi_confsubr.o scsi_control.o \
+ scsi_data.o scsi_fm.o scsi_hba.o scsi_reset_notify.o \
+ scsi_resource.o scsi_subr.o scsi_transport.o scsi_watch.o \
+ smp_transport.o
+
+SCSI_VHCI_OBJS += scsi_vhci.o mpapi_impl.o scsi_vhci_tpgs.o
+
+SCSI_VHCI_F_SYM_OBJS += sym.o
+
+SCSI_VHCI_F_TPGS_OBJS += tpgs.o
+
+SCSI_VHCI_F_ASYM_SUN_OBJS += asym_sun.o
+
+SCSI_VHCI_F_SYM_HDS_OBJS += sym_hds.o
+
+SCSI_VHCI_F_TAPE_OBJS += tape.o
+
+SCSI_VHCI_F_TPGS_TAPE_OBJS += tpgs_tape.o
+
+SGEN_OBJS += sgen.o
+
+SMP_OBJS += smp.o
+
+SATA_OBJS += sata.o
+
+USBA_OBJS += hcdi.o usba.o usbai.o hubdi.o parser.o genconsole.o \
+ usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \
+ usba_devdb.o usba10_calls.o usba_ugen.o whcdi.o wa.o
+USBA_WITHOUT_WUSB_OBJS += hcdi.o usba.o usbai.o hubdi.o parser.o genconsole.o \
+ usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \
+ usba_devdb.o usba10_calls.o usba_ugen.o
+
+USBA10_OBJS += usba10.o
+
+RSM_OBJS += rsm.o rsmka_pathmanager.o rsmka_util.o
+
+RSMOPS_OBJS += rsmops.o
+
+S1394_OBJS += t1394.o t1394_errmsg.o s1394.o s1394_addr.o s1394_asynch.o \
+ s1394_bus_reset.o s1394_cmp.o s1394_csr.o s1394_dev_disc.o \
+ s1394_fa.o s1394_fcp.o \
+ s1394_hotplug.o s1394_isoch.o s1394_misc.o h1394.o nx1394.o
+
+HCI1394_OBJS += hci1394.o hci1394_async.o hci1394_attach.o hci1394_buf.o \
+ hci1394_csr.o hci1394_detach.o hci1394_extern.o \
+ hci1394_ioctl.o hci1394_isoch.o hci1394_isr.o \
+ hci1394_ixl_comp.o hci1394_ixl_isr.o hci1394_ixl_misc.o \
+ hci1394_ixl_update.o hci1394_misc.o hci1394_ohci.o \
+ hci1394_q.o hci1394_s1394if.o hci1394_tlabel.o \
+ hci1394_tlist.o hci1394_vendor.o
+
+AV1394_OBJS += av1394.o av1394_as.o av1394_async.o av1394_cfgrom.o \
+ av1394_cmp.o av1394_fcp.o av1394_isoch.o av1394_isoch_chan.o \
+ av1394_isoch_recv.o av1394_isoch_xmit.o av1394_list.o \
+ av1394_queue.o
+
+DCAM1394_OBJS += dcam.o dcam_frame.o dcam_param.o dcam_reg.o \
+ dcam_ring_buff.o
+
+SCSA1394_OBJS += hba.o sbp2_driver.o sbp2_bus.o
+
+SBP2_OBJS += cfgrom.o sbp2.o
+
+PMODEM_OBJS += pmodem.o pmodem_cis.o cis.o cis_callout.o cis_handlers.o cis_params.o
+
+DSW_OBJS += dsw.o dsw_dev.o ii_tree.o
+
+NCALL_OBJS += ncall.o \
+ ncall_stub.o
+
+RDC_OBJS += rdc.o \
+ rdc_dev.o \
+ rdc_io.o \
+ rdc_clnt.o \
+ rdc_prot_xdr.o \
+ rdc_svc.o \
+ rdc_bitmap.o \
+ rdc_health.o \
+ rdc_subr.o \
+ rdc_diskq.o
+
+RDCSRV_OBJS += rdcsrv.o
+
+RDCSTUB_OBJS += rdc_stub.o
+
+SDBC_OBJS += sd_bcache.o \
+ sd_bio.o \
+ sd_conf.o \
+ sd_ft.o \
+ sd_hash.o \
+ sd_io.o \
+ sd_misc.o \
+ sd_pcu.o \
+ sd_tdaemon.o \
+ sd_trace.o \
+ sd_iob_impl0.o \
+ sd_iob_impl1.o \
+ sd_iob_impl2.o \
+ sd_iob_impl3.o \
+ sd_iob_impl4.o \
+ sd_iob_impl5.o \
+ sd_iob_impl6.o \
+ sd_iob_impl7.o \
+ safestore.o \
+ safestore_ram.o
+
+NSCTL_OBJS += nsctl.o \
+ nsc_cache.o \
+ nsc_disk.o \
+ nsc_dev.o \
+ nsc_freeze.o \
+ nsc_gen.o \
+ nsc_mem.o \
+ nsc_ncallio.o \
+ nsc_power.o \
+ nsc_resv.o \
+ nsc_rmspin.o \
+ nsc_solaris.o \
+ nsc_trap.o \
+ nsc_list.o
+UNISTAT_OBJS += spuni.o \
+ spcs_s_k.o
+
+NSKERN_OBJS += nsc_ddi.o \
+ nsc_proc.o \
+ nsc_raw.o \
+ nsc_thread.o \
+ nskernd.o
+
+SV_OBJS += sv.o
+
+PMCS_OBJS += pmcs_attach.o pmcs_ds.o pmcs_intr.o pmcs_nvram.o pmcs_sata.o \
+ pmcs_scsa.o pmcs_smhba.o pmcs_subr.o pmcs_fwlog.o
+
+PMCS8001FW_C_OBJS += pmcs_fw_hdr.o
+PMCS8001FW_OBJS += $(PMCS8001FW_C_OBJS) SPCBoot.o ila.o firmware.o
+
+#
+# Build up defines and paths.
+
+ST_OBJS += st.o st_conf.o
+
+EMLXS_OBJS += emlxs_clock.o emlxs_dfc.o emlxs_dhchap.o emlxs_diag.o \
+ emlxs_download.o emlxs_dump.o emlxs_els.o emlxs_event.o \
+ emlxs_fcp.o emlxs_fct.o emlxs_hba.o emlxs_ip.o \
+ emlxs_mbox.o emlxs_mem.o emlxs_msg.o emlxs_node.o \
+ emlxs_pkt.o emlxs_sli3.o emlxs_sli4.o emlxs_solaris.o \
+ emlxs_thread.o
+
+EMLXS_FW_OBJS += emlxs_fw.o
+
+OCE_OBJS += oce_buf.o oce_fm.o oce_gld.o oce_hw.o oce_intr.o oce_main.o \
+ oce_mbx.o oce_mq.o oce_queue.o oce_rx.o oce_stat.o oce_tx.o \
+ oce_utils.o
+
+FCT_OBJS += discovery.o fct.o
+
+QLT_OBJS += 2400.o 2500.o 8100.o qlt.o qlt_dma.o
+
+SRPT_OBJS += srpt_mod.o srpt_ch.o srpt_cm.o srpt_ioc.o srpt_stp.o
+
+FCOE_OBJS += fcoe.o fcoe_eth.o fcoe_fc.o
+
+FCOET_OBJS += fcoet.o fcoet_eth.o fcoet_fc.o
+
+FCOEI_OBJS += fcoei.o fcoei_eth.o fcoei_lv.o
+
+ISCSIT_SHARED_OBJS += \
+ iscsit_common.o
+
+ISCSIT_OBJS += $(ISCSIT_SHARED_OBJS) \
+ iscsit.o iscsit_tgt.o iscsit_sess.o iscsit_login.o \
+ iscsit_text.o iscsit_isns.o iscsit_radiusauth.o \
+ iscsit_radiuspacket.o iscsit_auth.o iscsit_authclient.o
+
+PPPT_OBJS += alua_ic_if.o pppt.o pppt_msg.o pppt_tgt.o
+
+STMF_OBJS += lun_map.o stmf.o
+
+STMF_SBD_OBJS += sbd.o sbd_scsi.o sbd_pgr.o sbd_zvol.o
+
+SYSMSG_OBJS += sysmsg.o
+
+SES_OBJS += ses.o ses_sen.o ses_safte.o ses_ses.o
+
+TNF_OBJS += tnf_buf.o tnf_trace.o tnf_writer.o trace_init.o \
+ trace_funcs.o tnf_probe.o tnf.o
+
+LOGINDMUX_OBJS += logindmux.o
+
+DEVINFO_OBJS += devinfo.o
+
+DEVPOLL_OBJS += devpoll.o
+
+DEVPOOL_OBJS += devpool.o
+
+I8042_OBJS += i8042.o
+
+KB8042_OBJS += \
+ at_keyprocess.o \
+ kb8042.o \
+ kb8042_keytables.o
+
+MOUSE8042_OBJS += mouse8042.o
+
+FDC_OBJS += fdc.o
+
+ASY_OBJS += asy.o
+
+ECPP_OBJS += ecpp.o
+
+VUIDM3P_OBJS += vuidmice.o vuidm3p.o
+
+VUIDM4P_OBJS += vuidmice.o vuidm4p.o
+
+VUIDM5P_OBJS += vuidmice.o vuidm5p.o
+
+VUIDPS2_OBJS += vuidmice.o vuidps2.o
+
+HPCSVC_OBJS += hpcsvc.o
+
+PCIE_MISC_OBJS += pcie.o pcie_fault.o pcie_hp.o pciehpc.o pcishpc.o pcie_pwr.o pciev.o
+
+PCIHPNEXUS_OBJS += pcihp.o
+
+OPENEEPR_OBJS += openprom.o
+
+RANDOM_OBJS += random.o
+
+PSHOT_OBJS += pshot.o
+
+GEN_DRV_OBJS += gen_drv.o
+
+TCLIENT_OBJS += tclient.o
+
+TPHCI_OBJS += tphci.o
+
+TVHCI_OBJS += tvhci.o
+
+EMUL64_OBJS += emul64.o emul64_bsd.o
+
+FCP_OBJS += fcp.o
+
+FCIP_OBJS += fcip.o
+
+FCSM_OBJS += fcsm.o
+
+FCTL_OBJS += fctl.o
+
+FP_OBJS += fp.o
+
+QLC_OBJS += ql_api.o ql_debug.o ql_hba_fru.o ql_init.o ql_iocb.o ql_ioctl.o \
+ ql_isr.o ql_mbx.o ql_nx.o ql_xioctl.o ql_fw_table.o
+
+QLC_FW_2200_OBJS += ql_fw_2200.o
+
+QLC_FW_2300_OBJS += ql_fw_2300.o
+
+QLC_FW_2400_OBJS += ql_fw_2400.o
+
+QLC_FW_2500_OBJS += ql_fw_2500.o
+
+QLC_FW_6322_OBJS += ql_fw_6322.o
+
+QLC_FW_8100_OBJS += ql_fw_8100.o
+
+QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o
+
+ZCONS_OBJS += zcons.o
+
+NV_SATA_OBJS += nv_sata.o
+
+SI3124_OBJS += si3124.o
+
+AHCI_OBJS += ahci.o
+
+PCIIDE_OBJS += pci-ide.o
+
+PCEPP_OBJS += pcepp.o
+
+CPC_OBJS += cpc.o
+
+CPUID_OBJS += cpuid_drv.o
+
+SYSEVENT_OBJS += sysevent.o
+
+BL_OBJS += bl.o
+
+DRM_OBJS += drm_sunmod.o drm_kstat.o drm_agpsupport.o \
+ drm_auth.o drm_bufs.o drm_context.o drm_dma.o \
+ drm_drawable.o drm_drv.o drm_fops.o drm_ioctl.o drm_irq.o \
+ drm_lock.o drm_memory.o drm_msg.o drm_pci.o drm_scatter.o \
+ drm_cache.o drm_gem.o drm_mm.o ati_pcigart.o
+
+FM_OBJS += devfm.o devfm_machdep.o
+
+RTLS_OBJS += rtls.o
+
+#
+# exec modules
+#
+AOUTEXEC_OBJS +=aout.o
+
+ELFEXEC_OBJS += elf.o elf_notes.o old_notes.o
+
+INTPEXEC_OBJS +=intp.o
+
+SHBINEXEC_OBJS +=shbin.o
+
+JAVAEXEC_OBJS +=java.o
+
+#
+# file system modules
+#
+AUTOFS_OBJS += auto_vfsops.o auto_vnops.o auto_subr.o auto_xdr.o auto_sys.o
+
+CACHEFS_OBJS += cachefs_cnode.o cachefs_cod.o \
+ cachefs_dir.o cachefs_dlog.o cachefs_filegrp.o \
+ cachefs_fscache.o cachefs_ioctl.o cachefs_log.o \
+ cachefs_module.o \
+ cachefs_noopc.o cachefs_resource.o \
+ cachefs_strict.o \
+ cachefs_subr.o cachefs_vfsops.o \
+ cachefs_vnops.o
+
+DCFS_OBJS += dc_vnops.o
+
+DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o
+
+DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \
+ sdev_ptsops.o sdev_zvolops.o sdev_comm.o \
+ sdev_profile.o sdev_ncache.o sdev_netops.o \
+ sdev_ipnetops.o \
+ sdev_vtops.o
+
+CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \
+ ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o
+
+OBJFS_OBJS += objfs_vfs.o objfs_root.o objfs_common.o \
+ objfs_odir.o objfs_data.o
+
+FDFS_OBJS += fdops.o
+
+FIFO_OBJS += fifosubr.o fifovnops.o
+
+PIPE_OBJS += pipe.o
+
+HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \
+ hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o
+
+LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o
+
+NAMEFS_OBJS += namevfs.o namevno.o
+
+NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \
+ nfs_subr.o nfs_vfsops.o nfs_vnops.o \
+ nfs_xdr.o nfs_sys.o nfs_strerror.o \
+ nfs3_vfsops.o nfs3_vnops.o nfs3_xdr.o \
+ nfs_acl_vnops.o nfs_acl_xdr.o nfs4_vfsops.o \
+ nfs4_vnops.o nfs4_xdr.o nfs4_idmap.o \
+ nfs4_shadow.o nfs4_subr.o \
+ nfs4_attr.o nfs4_rnode.o nfs4_client.o \
+ nfs4_acache.o nfs4_common.o nfs4_client_state.o \
+ nfs4_callback.o nfs4_recovery.o nfs4_client_secinfo.o \
+ nfs4_client_debug.o nfs_stats.o \
+ nfs4_acl.o nfs4_stub_vnops.o nfs_cmd.o
+
+NFSSRV_OBJS += nfs_server.o nfs_srv.o nfs3_srv.o \
+ nfs_acl_srv.o nfs_auth.o nfs_auth_xdr.o \
+ nfs_export.o nfs_log.o nfs_log_xdr.o \
+ nfs4_srv.o nfs4_state.o nfs4_srv_attr.o \
+ nfs4_srv_ns.o nfs4_db.o nfs4_srv_deleg.o \
+ nfs4_deleg_ops.o nfs4_srv_readdir.o nfs4_dispatch.o
+
+SMBSRV_SHARED_OBJS += \
+ smb_inet.o \
+ smb_match.o \
+ smb_msgbuf.o \
+ smb_oem.o \
+ smb_string.o \
+ smb_utf8.o \
+ smb_door_legacy.o \
+ smb_xdr.o \
+ smb_token.o \
+ smb_token_xdr.o \
+ smb_sid.o \
+ smb_native.o \
+ smb_netbios_util.o
+
+SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \
+ smb_acl.o \
+ smb_alloc.o \
+ smb_close.o \
+ smb_common_open.o \
+ smb_common_transact.o \
+ smb_create.o \
+ smb_delete.o \
+ smb_directory.o \
+ smb_dispatch.o \
+ smb_echo.o \
+ smb_fem.o \
+ smb_find.o \
+ smb_flush.o \
+ smb_fsinfo.o \
+ smb_fsops.o \
+ smb_init.o \
+ smb_kdoor.o \
+ smb_kshare.o \
+ smb_kutil.o \
+ smb_lock.o \
+ smb_lock_byte_range.o \
+ smb_locking_andx.o \
+ smb_logoff_andx.o \
+ smb_mangle_name.o \
+ smb_mbuf_marshaling.o \
+ smb_mbuf_util.o \
+ smb_negotiate.o \
+ smb_net.o \
+ smb_node.o \
+ smb_nt_cancel.o \
+ smb_nt_create_andx.o \
+ smb_nt_transact_create.o \
+ smb_nt_transact_ioctl.o \
+ smb_nt_transact_notify_change.o \
+ smb_nt_transact_quota.o \
+ smb_nt_transact_security.o \
+ smb_odir.o \
+ smb_ofile.o \
+ smb_open_andx.o \
+ smb_opipe.o \
+ smb_oplock.o \
+ smb_pathname.o \
+ smb_print.o \
+ smb_process_exit.o \
+ smb_query_fileinfo.o \
+ smb_read.o \
+ smb_rename.o \
+ smb_sd.o \
+ smb_seek.o \
+ smb_server.o \
+ smb_session.o \
+ smb_session_setup_andx.o \
+ smb_set_fileinfo.o \
+ smb_signing.o \
+ smb_tree.o \
+ smb_trans2_create_directory.o \
+ smb_trans2_dfs.o \
+ smb_trans2_find.o \
+ smb_tree_connect.o \
+ smb_unlock_byte_range.o \
+ smb_user.o \
+ smb_vfs.o \
+ smb_vops.o \
+ smb_vss.o \
+ smb_write.o \
+ smb_write_raw.o
+
+PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \
+ pc_vfsops.o pc_vnops.o
+
+PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \
+ prvfsops.o prvnops.o
+
+MNTFS_OBJS += mntvfsops.o mntvnops.o
+
+SHAREFS_OBJS += sharetab.o sharefs_vfsops.o sharefs_vnops.o
+
+SPEC_OBJS += specsubr.o specvfsops.o specvnops.o
+
+SOCK_OBJS += socksubr.o sockvfsops.o sockparams.o \
+ socksyscalls.o socktpi.o sockstr.o \
+ sockcommon_vnops.o sockcommon_subr.o \
+ sockcommon_sops.o sockcommon.o \
+ sock_notsupp.o socknotify.o \
+ nl7c.o nl7curi.o nl7chttp.o nl7clogd.o \
+ nl7cnca.o sodirect.o sockfilter.o
+
+TMPFS_OBJS += tmp_dir.o tmp_subr.o tmp_tnode.o tmp_vfsops.o \
+ tmp_vnops.o
+
+UDFS_OBJS += udf_alloc.o udf_bmap.o udf_dir.o \
+ udf_inode.o udf_subr.o udf_vfsops.o \
+ udf_vnops.o
+
+UFS_OBJS += ufs_alloc.o ufs_bmap.o ufs_dir.o ufs_xattr.o \
+ ufs_inode.o ufs_subr.o ufs_tables.o ufs_vfsops.o \
+ ufs_vnops.o quota.o quotacalls.o quota_ufs.o \
+ ufs_filio.o ufs_lockfs.o ufs_thread.o ufs_trans.o \
+ ufs_acl.o ufs_panic.o ufs_directio.o ufs_log.o \
+ ufs_extvnops.o ufs_snap.o lufs.o lufs_thread.o \
+ lufs_log.o lufs_map.o lufs_top.o lufs_debug.o
+VSCAN_OBJS += vscan_drv.o vscan_svc.o vscan_door.o
+
+NSMB_OBJS += smb_conn.o smb_dev.o smb_iod.o smb_pass.o \
+ smb_rq.o smb_sign.o smb_smb.o smb_subrs.o \
+ smb_time.o smb_tran.o smb_trantcp.o smb_usr.o \
+ subr_mchain.o
+
+SMBFS_COMMON_OBJS += smbfs_ntacl.o
+SMBFS_OBJS += smbfs_vfsops.o smbfs_vnops.o smbfs_node.o \
+ smbfs_acl.o smbfs_client.o smbfs_smb.o \
+ smbfs_subr.o smbfs_subr2.o \
+ smbfs_rwlock.o smbfs_xattr.o \
+ $(SMBFS_COMMON_OBJS)
+
+
+#
+# LVM modules
+#
+MD_OBJS += md.o md_error.o md_ioctl.o md_mddb.o md_names.o \
+ md_med.o md_rename.o md_subr.o
+
+MD_COMMON_OBJS = md_convert.o md_crc.o md_revchk.o
+
+MD_DERIVED_OBJS = metamed_xdr.o meta_basic_xdr.o
+
+SOFTPART_OBJS += sp.o sp_ioctl.o
+
+STRIPE_OBJS += stripe.o stripe_ioctl.o
+
+HOTSPARES_OBJS += hotspares.o
+
+RAID_OBJS += raid.o raid_ioctl.o raid_replay.o raid_resync.o raid_hotspare.o
+
+MIRROR_OBJS += mirror.o mirror_ioctl.o mirror_resync.o
+
+NOTIFY_OBJS += md_notify.o
+
+TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o
+
+ZFS_COMMON_OBJS += \
+ arc.o \
+ bplist.o \
+ bpobj.o \
+ dbuf.o \
+ ddt.o \
+ ddt_zap.o \
+ dmu.o \
+ dmu_diff.o \
+ dmu_send.o \
+ dmu_object.o \
+ dmu_objset.o \
+ dmu_traverse.o \
+ dmu_tx.o \
+ dnode.o \
+ dnode_sync.o \
+ dsl_dir.o \
+ dsl_dataset.o \
+ dsl_deadlist.o \
+ dsl_pool.o \
+ dsl_synctask.o \
+ dmu_zfetch.o \
+ dsl_deleg.o \
+ dsl_prop.o \
+ dsl_scan.o \
+ gzip.o \
+ lzjb.o \
+ metaslab.o \
+ refcount.o \
+ sa.o \
+ sha256.o \
+ spa.o \
+ spa_config.o \
+ spa_errlog.o \
+ spa_history.o \
+ spa_misc.o \
+ space_map.o \
+ txg.o \
+ uberblock.o \
+ unique.o \
+ vdev.o \
+ vdev_cache.o \
+ vdev_file.o \
+ vdev_label.o \
+ vdev_mirror.o \
+ vdev_missing.o \
+ vdev_queue.o \
+ vdev_raidz.o \
+ vdev_root.o \
+ zap.o \
+ zap_leaf.o \
+ zap_micro.o \
+ zfs_byteswap.o \
+ zfs_debug.o \
+ zfs_fm.o \
+ zfs_fuid.o \
+ zfs_sa.o \
+ zfs_znode.o \
+ zil.o \
+ zio.o \
+ zio_checksum.o \
+ zio_compress.o \
+ zio_inject.o \
+ zle.o \
+ zrlock.o
+
+ZFS_SHARED_OBJS += \
+ zfs_namecheck.o \
+ zfs_deleg.o \
+ zfs_prop.o \
+ zfs_comutil.o \
+ zfs_fletcher.o \
+ zpool_prop.o \
+ zprop_common.o
+
+ZFS_OBJS += \
+ $(ZFS_COMMON_OBJS) \
+ $(ZFS_SHARED_OBJS) \
+ vdev_disk.o \
+ zfs_acl.o \
+ zfs_ctldir.o \
+ zfs_dir.o \
+ zfs_ioctl.o \
+ zfs_log.o \
+ zfs_onexit.o \
+ zfs_replay.o \
+ zfs_rlock.o \
+ rrwlock.o \
+ zfs_vfsops.o \
+ zfs_vnops.o \
+ zvol.o
+
+ZUT_OBJS += \
+ zut.o
+
+#
+# streams modules
+#
+BUFMOD_OBJS += bufmod.o
+
+CONNLD_OBJS += connld.o
+
+DEDUMP_OBJS += dedump.o
+
+DRCOMPAT_OBJS += drcompat.o
+
+LDLINUX_OBJS += ldlinux.o
+
+LDTERM_OBJS += ldterm.o uwidth.o
+
+PCKT_OBJS += pckt.o
+
+PFMOD_OBJS += pfmod.o
+
+PTEM_OBJS += ptem.o
+
+REDIRMOD_OBJS += strredirm.o
+
+TIMOD_OBJS += timod.o
+
+TIRDWR_OBJS += tirdwr.o
+
+TTCOMPAT_OBJS +=ttcompat.o
+
+LOG_OBJS += log.o
+
+PIPEMOD_OBJS += pipemod.o
+
+RPCMOD_OBJS += rpcmod.o clnt_cots.o clnt_clts.o \
+ clnt_gen.o clnt_perr.o mt_rpcinit.o rpc_calmsg.o \
+ rpc_prot.o rpc_sztypes.o rpc_subr.o rpcb_prot.o \
+ svc.o svc_clts.o svc_gen.o svc_cots.o \
+ rpcsys.o xdr_sizeof.o clnt_rdma.o svc_rdma.o \
+ xdr_rdma.o rdma_subr.o xdrrdma_sizeof.o
+
+TLIMOD_OBJS += tlimod.o t_kalloc.o t_kbind.o t_kclose.o \
+ t_kconnect.o t_kfree.o t_kgtstate.o t_kopen.o \
+ t_krcvudat.o t_ksndudat.o t_kspoll.o t_kunbind.o \
+ t_kutil.o
+
+RLMOD_OBJS += rlmod.o
+
+TELMOD_OBJS += telmod.o
+
+CRYPTMOD_OBJS += cryptmod.o
+
+KB_OBJS += kbd.o keytables.o
+
+#
+# ID mapping module
+#
+IDMAP_OBJS += idmap_mod.o idmap_kapi.o idmap_xdr.o idmap_cache.o
+
+#
+# scheduling class modules
+#
+SDC_OBJS += sysdc.o
+
+RT_OBJS += rt.o
+RT_DPTBL_OBJS += rt_dptbl.o
+
+TS_OBJS += ts.o
+TS_DPTBL_OBJS += ts_dptbl.o
+
+IA_OBJS += ia.o
+
+FSS_OBJS += fss.o
+
+FX_OBJS += fx.o
+FX_DPTBL_OBJS += fx_dptbl.o
+
+#
+# Inter-Process Communication (IPC) modules
+#
+IPC_OBJS += ipc.o
+
+IPCMSG_OBJS += msg.o
+
+IPCSEM_OBJS += sem.o
+
+IPCSHM_OBJS += shm.o
+
+#
+# bignum module
+#
+COMMON_BIGNUM_OBJS += bignum_mod.o bignumimpl.o
+
+BIGNUM_OBJS += $(COMMON_BIGNUM_OBJS) $(BIGNUM_PSR_OBJS)
+
+#
+# kernel cryptographic framework
+#
+KCF_OBJS += kcf.o kcf_callprov.o kcf_cbufcall.o kcf_cipher.o kcf_crypto.o \
+ kcf_cryptoadm.o kcf_ctxops.o kcf_digest.o kcf_dual.o \
+ kcf_keys.o kcf_mac.o kcf_mech_tabs.o kcf_miscapi.o \
+ kcf_object.o kcf_policy.o kcf_prov_lib.o kcf_prov_tabs.o \
+ kcf_sched.o kcf_session.o kcf_sign.o kcf_spi.o kcf_verify.o \
+ kcf_random.o modes.o ecb.o cbc.o ctr.o ccm.o gcm.o \
+ fips_random.o fips_checksum.o fips_test_vectors.o
+
+CRYPTOADM_OBJS += cryptoadm.o
+
+CRYPTO_OBJS += crypto.o
+
+DPROV_OBJS += dprov.o
+
+DCA_OBJS += dca.o dca_3des.o dca_debug.o dca_dsa.o dca_kstat.o dca_rng.o \
+ dca_rsa.o
+
+AESPROV_OBJS += aes.o aes_impl.o aes_modes.o fips_aes_util.o
+
+ARCFOURPROV_OBJS += arcfour.o arcfour_crypt.o
+
+BLOWFISHPROV_OBJS += blowfish.o blowfish_impl.o
+
+ECCPROV_OBJS += ecc.o ec.o ec2_163.o ec2_mont.o ecdecode.o ecl_mult.o \
+ ecp_384.o ecp_jac.o ec2_193.o ecl.o ecp_192.o ecp_521.o \
+ ecp_jm.o ec2_233.o ecl_curve.o ecp_224.o ecp_aff.o \
+ ecp_mont.o ec2_aff.o ec_naf.o ecl_gf.o ecp_256.o mp_gf2m.o \
+ mpi.o mplogic.o mpmontg.o mpprime.o oid.o \
+ secitem.o ec2_test.o ecp_test.o fips_ecc_util.o
+
+RSAPROV_OBJS += rsa.o rsa_impl.o pkcs1.o fips_rsa_util.o
+
+SWRANDPROV_OBJS += swrand.o fips_random_util.o
+
+#
+# kernel SSL
+#
+KSSL_OBJS += kssl.o ksslioctl.o
+
+KSSL_SOCKFIL_MOD_OBJS += ksslfilter.o ksslapi.o ksslrec.o
+
+#
+# misc. modules
+#
+
+C2AUDIT_OBJS += adr.o audit.o audit_event.o audit_io.o \
+ audit_path.o audit_start.o audit_syscalls.o audit_token.o \
+ audit_mem.o
+
+PCIC_OBJS += pcic.o
+
+RPCSEC_OBJS += secmod.o sec_clnt.o sec_svc.o sec_gen.o \
+ auth_des.o auth_kern.o auth_none.o auth_loopb.o\
+ authdesprt.o authdesubr.o authu_prot.o \
+ key_call.o key_prot.o svc_authu.o svcauthdes.o
+
+RPCSEC_GSS_OBJS += rpcsec_gssmod.o rpcsec_gss.o rpcsec_gss_misc.o \
+ rpcsec_gss_utils.o svc_rpcsec_gss.o
+
+CONSCONFIG_OBJS += consconfig.o
+
+CONSCONFIG_DACF_OBJS += consconfig_dacf.o consplat.o
+
+TEM_OBJS += tem.o tem_safe.o 6x10.o 7x14.o 12x22.o
+
+KBTRANS_OBJS += \
+ kbtrans.o \
+ kbtrans_keytables.o \
+ kbtrans_polled.o \
+ kbtrans_streams.o \
+ usb_keytables.o
+
+KGSSD_OBJS += gssd_clnt_stubs.o gssd_handle.o gssd_prot.o \
+ gss_display_name.o gss_release_name.o gss_import_name.o \
+ gss_release_buffer.o gss_release_oid_set.o gen_oids.o gssdmod.o
+
+KGSSD_DERIVED_OBJS = gssd_xdr.o
+
+KGSS_DUMMY_OBJS += dmech.o
+
+KSOCKET_OBJS += ksocket.o ksocket_mod.o
+
+CRYPTO= cksumtypes.o decrypt.o encrypt.o encrypt_length.o etypes.o \
+ nfold.o verify_checksum.o prng.o block_size.o make_checksum.o\
+ checksum_length.o hmac.o default_state.o mandatory_sumtype.o
+
+# crypto/des
+CRYPTO_DES= f_cbc.o f_cksum.o f_parity.o weak_key.o d3_cbc.o ef_crypto.o
+
+CRYPTO_DK= checksum.o derive.o dk_decrypt.o dk_encrypt.o
+
+CRYPTO_ARCFOUR= k5_arcfour.o
+
+# crypto/enc_provider
+CRYPTO_ENC= des.o des3.o arcfour_provider.o aes_provider.o
+
+# crypto/hash_provider
+CRYPTO_HASH= hash_kef_generic.o hash_kmd5.o hash_crc32.o hash_ksha1.o
+
+# crypto/keyhash_provider
+CRYPTO_KEYHASH= descbc.o k5_kmd5des.o k_hmac_md5.o
+
+# crypto/crc32
+CRYPTO_CRC32= crc32.o
+
+# crypto/old
+CRYPTO_OLD= old_decrypt.o old_encrypt.o
+
+# crypto/raw
+CRYPTO_RAW= raw_decrypt.o raw_encrypt.o
+
+K5_KRB= kfree.o copy_key.o \
+ parse.o init_ctx.o \
+ ser_adata.o ser_addr.o \
+ ser_auth.o ser_cksum.o \
+ ser_key.o ser_princ.o \
+ serialize.o unparse.o \
+ ser_actx.o
+
+K5_OS= timeofday.o toffset.o \
+ init_os_ctx.o c_ustime.o
+
+SEAL=
+# EXPORT DELETE START
+SEAL= seal.o unseal.o
+# EXPORT DELETE END
+
+MECH= delete_sec_context.o \
+ import_sec_context.o \
+ gssapi_krb5.o \
+ k5seal.o k5unseal.o k5sealv3.o \
+ ser_sctx.o \
+ sign.o \
+ util_crypt.o \
+ util_validate.o util_ordering.o \
+ util_seqnum.o util_set.o util_seed.o \
+ wrap_size_limit.o verify.o
+
+
+
+MECH_GEN= util_token.o
+
+
+KGSS_KRB5_OBJS += krb5mech.o \
+ $(MECH) $(SEAL) $(MECH_GEN) \
+ $(CRYPTO) $(CRYPTO_DES) $(CRYPTO_DK) $(CRYPTO_ARCFOUR) \
+ $(CRYPTO_ENC) $(CRYPTO_HASH) \
+ $(CRYPTO_KEYHASH) $(CRYPTO_CRC32) \
+ $(CRYPTO_OLD) \
+ $(CRYPTO_RAW) $(K5_KRB) $(K5_OS)
+
+DES_OBJS += des_crypt.o des_impl.o des_ks.o des_soft.o fips_des_util.o
+
+DLBOOT_OBJS += bootparam_xdr.o nfs_dlinet.o scan.o
+
+KRTLD_OBJS += kobj_bootflags.o getoptstr.o \
+ kobj.o kobj_kdi.o kobj_lm.o kobj_subr.o
+
+MOD_OBJS += modctl.o modsubr.o modsysfile.o modconf.o modhash.o
+
+STRPLUMB_OBJS += strplumb.o
+
+CPR_OBJS += cpr_driver.o cpr_dump.o \
+ cpr_main.o cpr_misc.o cpr_mod.o cpr_stat.o \
+ cpr_uthread.o
+
+PROF_OBJS += prf.o
+
+SE_OBJS += se_driver.o
+
+SYSACCT_OBJS += acct.o
+
+ACCTCTL_OBJS += acctctl.o
+
+EXACCTSYS_OBJS += exacctsys.o
+
+KAIO_OBJS += aio.o
+
+PCMCIA_OBJS += pcmcia.o cs.o cis.o cis_callout.o cis_handlers.o cis_params.o
+
+BUSRA_OBJS += busra.o
+
+PCS_OBJS += pcs.o
+
+PCAN_OBJS += pcan.o
+
+PCATA_OBJS += pcide.o pcdisk.o pclabel.o pcata.o
+
+PCSER_OBJS += pcser.o pcser_cis.o
+
+PCWL_OBJS += pcwl.o
+
+PSET_OBJS += pset.o
+
+OHCI_OBJS += ohci.o ohci_hub.o ohci_polled.o
+
+UHCI_OBJS += uhci.o uhciutil.o uhcitgt.o uhcihub.o uhcipolled.o
+
+EHCI_OBJS += ehci.o ehci_hub.o ehci_xfer.o ehci_intr.o ehci_util.o ehci_polled.o ehci_isoch.o ehci_isoch_util.o
+
+HUBD_OBJS += hubd.o
+
+USB_MID_OBJS += usb_mid.o
+
+USB_IA_OBJS += usb_ia.o
+
+UWBA_OBJS += uwba.o uwbai.o
+
+SCSA2USB_OBJS += scsa2usb.o usb_ms_bulkonly.o usb_ms_cbi.o
+
+HWAHC_OBJS += hwahc.o hwahc_util.o
+
+WUSB_DF_OBJS += wusb_df.o
+WUSB_FWMOD_OBJS += wusb_fwmod.o
+
+IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \
+ ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \
+ ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o
+
+IBD_OBJS += ibd.o ibd_cm.o
+
+EIBNX_OBJS += enx_main.o enx_hdlrs.o enx_ibt.o enx_log.o enx_fip.o \
+ enx_misc.o enx_q.o enx_ctl.o
+
+EOIB_OBJS += eib_adm.o eib_chan.o eib_cmn.o eib_ctl.o eib_data.o \
+ eib_fip.o eib_ibt.o eib_log.o eib_mac.o eib_main.o \
+ eib_rsrc.o eib_svc.o eib_vnic.o
+
+DLPISTUB_OBJS += dlpistub.o
+
+SDP_OBJS += sdpddi.o
+
+TRILL_OBJS += trill.o
+
+CTF_OBJS += ctf_create.o ctf_decl.o ctf_error.o ctf_hash.o ctf_labels.o \
+ ctf_lookup.o ctf_open.o ctf_types.o ctf_util.o ctf_subr.o ctf_mod.o
+
+SMBIOS_OBJS += smb_error.o smb_info.o smb_open.o smb_subr.o smb_dev.o
+
+RPCIB_OBJS += rpcib.o
+
+KMDB_OBJS += kdrv.o
+
+AFE_OBJS += afe.o
+
+BGE_OBJS += bge_main2.o bge_chip2.o bge_kstats.o bge_log.o bge_ndd.o \
+ bge_atomic.o bge_mii.o bge_send.o bge_recv2.o bge_mii_5906.o
+
+DMFE_OBJS += dmfe_log.o dmfe_main.o dmfe_mii.o
+
+ELXL_OBJS += elxl.o
+
+HME_OBJS += hme.o
+
+IXGB_OBJS += ixgb.o ixgb_atomic.o ixgb_chip.o ixgb_gld.o ixgb_kstats.o \
+ ixgb_log.o ixgb_ndd.o ixgb_rx.o ixgb_tx.o ixgb_xmii.o
+
+NGE_OBJS += nge_main.o nge_atomic.o nge_chip.o nge_ndd.o nge_kstats.o \
+ nge_log.o nge_rx.o nge_tx.o nge_xmii.o
+
+RGE_OBJS += rge_main.o rge_chip.o rge_ndd.o rge_kstats.o rge_log.o rge_rxtx.o
+
+URTW_OBJS += urtw.o
+
+ARN_OBJS += arn_hw.o arn_eeprom.o arn_mac.o arn_calib.o arn_ani.o arn_phy.o arn_regd.o arn_beacon.o \
+ arn_main.o arn_recv.o arn_xmit.o arn_rc.o
+
+ATH_OBJS += ath_aux.o ath_main.o ath_osdep.o ath_rate.o
+
+ATU_OBJS += atu.o
+
+IPW_OBJS += ipw2100_hw.o ipw2100.o
+
+IWI_OBJS += ipw2200_hw.o ipw2200.o
+
+IWH_OBJS += iwh.o
+
+IWK_OBJS += iwk2.o
+
+IWP_OBJS += iwp.o
+
+MWL_OBJS += mwl.o
+
+MWLFW_OBJS += mwlfw_mode.o
+
+WPI_OBJS += wpi.o
+
+RAL_OBJS += rt2560.o ral_rate.o
+
+RUM_OBJS += rum.o
+
+RWD_OBJS += rt2661.o
+
+RWN_OBJS += rt2860.o
+
+UATH_OBJS += uath.o
+
+UATHFW_OBJS += uathfw_mod.o
+
+URAL_OBJS += ural.o
+
+RTW_OBJS += rtw.o smc93cx6.o rtwphy.o rtwphyio.o
+
+ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o
+
+MXFE_OBJS += mxfe.o
+
+MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o mptsas_raid.o mptsas_smhba.o
+
+SFE_OBJS += sfe.o sfe_util.o
+
+BFE_OBJS += bfe.o
+
+BRIDGE_OBJS += bridge.o
+
+IDM_SHARED_OBJS += base64.o
+
+IDM_OBJS += $(IDM_SHARED_OBJS) \
+ idm.o idm_impl.o idm_text.o idm_conn_sm.o idm_so.o
+
+VR_OBJS += vr.o
+
+ATGE_OBJS += atge_main.o atge_l1e.o atge_mii.o atge_l1.o
+
+YGE_OBJS = yge.o
+
+#
+# Build up defines and paths.
+#
+LINT_DEFS += -Dunix
+
+#
+# This duality can be removed when the native and target compilers
+# are the same (or at least recognize the same command line syntax!)
+# It is a bug in the current compilation system that the assembler
+# can't process the -Y I, flag.
+#
+NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
+AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common
+INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
+
+PCIEB_OBJS += pcieb.o
+
+# Chelsio N110 10G NIC driver module
+#
+CH_OBJS = ch.o glue.o pe.o sge.o
+
+CH_COM_OBJS = ch_mac.o ch_subr.o cspi.o espi.o ixf1010.o mc3.o mc4.o mc5.o \
+ mv88e1xxx.o mv88x201x.o my3126.o pm3393.o tp.o ulp.o \
+ vsc7321.o vsc7326.o xpak.o
+
+#
+# PCI strings file
+#
+PCI_STRING_OBJS = pci_strings.o
+
+NET_DACF_OBJS += net_dacf.o
+
+#
+# Xframe 10G NIC driver module
+#
+XGE_OBJS = xge.o xgell.o
+
+XGE_HAL_OBJS = xgehal-channel.o xgehal-fifo.o xgehal-ring.o xgehal-config.o \
+ xgehal-driver.o xgehal-mm.o xgehal-stats.o xgehal-device.o \
+ xge-queue.o xgehal-mgmt.o xgehal-mgmtaux.o
+
+#
+# e1000g module
+#
+E1000G_OBJS += e1000_80003es2lan.o e1000_82540.o e1000_82541.o e1000_82542.o \
+ e1000_82543.o e1000_82571.o e1000_api.o e1000_ich8lan.o \
+ e1000_mac.o e1000_manage.o e1000_nvm.o e1000_osdep.o \
+ e1000_phy.o e1000g_debug.o e1000g_main.o e1000g_alloc.o \
+ e1000g_tx.o e1000g_rx.o e1000g_stat.o
+
+#
+# Intel 82575 1G NIC driver module
+#
+IGB_OBJS = igb_82575.o igb_api.o igb_mac.o igb_manage.o \
+ igb_nvm.o igb_osdep.o igb_phy.o igb_buf.o \
+ igb_debug.o igb_gld.o igb_log.o igb_main.o \
+ igb_rx.o igb_stat.o igb_tx.o
+
+#
+# Intel 10GbE PCIE NIC driver module
+#
+IXGBE_OBJS = ixgbe_82598.o ixgbe_82599.o ixgbe_api.o \
+ ixgbe_common.o ixgbe_phy.o \
+ ixgbe_buf.o ixgbe_debug.o ixgbe_gld.o \
+ ixgbe_log.o ixgbe_main.o \
+ ixgbe_osdep.o ixgbe_rx.o ixgbe_stat.o \
+ ixgbe_tx.o
+
+#
+# NIU 10G/1G driver module
+#
+NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \
+ nxge_txdma.o nxge_txc.o nxge_main.o \
+ nxge_hw.o nxge_fzc.o nxge_virtual.o \
+ nxge_send.o nxge_classify.o nxge_fflp.o \
+ nxge_fflp_hash.o nxge_ndd.o nxge_kstats.o \
+ nxge_zcp.o nxge_fm.o nxge_espc.o nxge_hv.o \
+ nxge_hio.o nxge_hio_guest.o nxge_intr.o
+
+NXGE_NPI_OBJS = \
+ npi.o npi_mac.o npi_ipp.o \
+ npi_txdma.o npi_rxdma.o npi_txc.o \
+ npi_zcp.o npi_espc.o npi_fflp.o \
+ npi_vir.o
+
+NXGE_HCALL_OBJS = \
+ nxge_hcall.o
+
+#
+# kiconv modules
+#
+KICONV_EMEA_OBJS += kiconv_emea.o
+
+#
+# blk2scsa
+#
+BLK2SCSA_OBJS = blk2scsa.o
+
+KICONV_JA_OBJS += kiconv_ja.o
+
+KICONV_KO_OBJS += kiconv_cck_common.o kiconv_ko.o
+
+KICONV_SC_OBJS += kiconv_cck_common.o kiconv_sc.o
+
+KICONV_TC_OBJS += kiconv_cck_common.o kiconv_tc.o
+
+#
+# AAC module
+#
+AAC_OBJS = aac.o aac_ioctl.o
+
+#
+# sdcard modules
+#
+SDA_OBJS = sda_cmd.o sda_host.o sda_init.o sda_mem.o sda_mod.o sda_slot.o
+SDHOST_OBJS = sdhost.o
+
+#
+# hxge 10G driver module
+#
+HXGE_OBJS = hxge_main.o hxge_vmac.o hxge_send.o \
+ hxge_txdma.o hxge_rxdma.o hxge_virtual.o \
+ hxge_fm.o hxge_fzc.o hxge_hw.o hxge_kstats.o \
+ hxge_ndd.o hxge_pfc.o \
+ hpi.o hpi_vmac.o hpi_rxdma.o hpi_txdma.o \
+ hpi_vir.o hpi_pfc.o
+
+#
+# MEGARAID_SAS module
+#
+MEGA_SAS_OBJS = megaraid_sas.o
+
+#
+# MR_SAS module
+#
+MR_SAS_OBJS = mr_sas.o
+
+#
+# ISCSI_INITIATOR module
+#
+ISCSI_INITIATOR_OBJS = chap.o iscsi_io.o iscsi_thread.o \
+ iscsi_ioctl.o iscsid.o iscsi.o \
+ iscsi_login.o isns_client.o iscsiAuthClient.o \
+ iscsi_lun.o iscsiAuthClientGlue.o \
+ iscsi_net.o nvfile.o iscsi_cmd.o \
+ iscsi_queue.o persistent.o iscsi_conn.o \
+ iscsi_sess.o radius_auth.o iscsi_crc.o \
+ iscsi_stats.o radius_packet.o iscsi_doorclt.o \
+ iscsi_targetparam.o utils.o kifconf.o
+
+#
+# ntxn 10Gb/1Gb NIC driver module
+#
+NTXN_OBJS = unm_nic_init.o unm_gem.o unm_nic_hw.o unm_ndd.o \
+ unm_nic_main.o unm_nic_isr.o unm_nic_ctx.o niu.o
+
+#
+# Myricom 10Gb NIC driver module
+#
+MYRI10GE_OBJS = myri10ge.o myri10ge_lro.o
+
+# nulldriver module
+#
+NULLDRIVER_OBJS = nulldriver.o
+
+TPM_OBJS = tpm.o tpm_hcall.o
diff --git a/uts/common/dtrace/dtrace.c b/uts/common/dtrace/dtrace.c
index c721386280f8..2a9df6d403f2 100644
--- a/uts/common/dtrace/dtrace.c
+++ b/uts/common/dtrace/dtrace.c
@@ -20,12 +20,9 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* DTrace - Dynamic Tracing for Solaris
*
@@ -186,7 +183,9 @@ static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
+static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
+static int dtrace_dynvar_failclean; /* dynvars failed to clean */
/*
* DTrace Locking
@@ -240,10 +239,16 @@ static void
dtrace_nullop(void)
{}
+static int
+dtrace_enable_nullop(void)
+{
+ return (0);
+}
+
static dtrace_pops_t dtrace_provider_ops = {
(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
(void (*)(void *, struct modctl *))dtrace_nullop,
- (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+ (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
@@ -427,6 +432,7 @@ dtrace_load##bits(uintptr_t addr) \
#define DTRACE_DYNHASH_SINK 1
#define DTRACE_DYNHASH_VALID 2
+#define DTRACE_MATCH_FAIL -1
#define DTRACE_MATCH_NEXT 0
#define DTRACE_MATCH_DONE 1
#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
@@ -1182,12 +1188,12 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
dtrace_dynvar_t *dirty;
dtrace_dstate_percpu_t *dcpu;
- int i, work = 0;
+ dtrace_dynvar_t **rinsep;
+ int i, j, work = 0;
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
-
- ASSERT(dcpu->dtdsc_rinsing == NULL);
+ rinsep = &dcpu->dtdsc_rinsing;
/*
* If the dirty list is NULL, there is no dirty work to do.
@@ -1195,14 +1201,62 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
if (dcpu->dtdsc_dirty == NULL)
continue;
- /*
- * If the clean list is non-NULL, then we're not going to do
- * any work for this CPU -- it means that there has not been
- * a dtrace_dynvar() allocation on this CPU (or from this CPU)
- * since the last time we cleaned house.
- */
- if (dcpu->dtdsc_clean != NULL)
+ if (dcpu->dtdsc_rinsing != NULL) {
+ /*
+ * If the rinsing list is non-NULL, then it is because
+ * this CPU was selected to accept another CPU's
+ * dirty list -- and since that time, dirty buffers
+ * have accumulated. This is a highly unlikely
+ * condition, but we choose to ignore the dirty
+ * buffers -- they'll be picked up in a future cleanse.
+ */
continue;
+ }
+
+ if (dcpu->dtdsc_clean != NULL) {
+ /*
+ * If the clean list is non-NULL, then we're in a
+ * situation where a CPU has done deallocations (we
+ * have a non-NULL dirty list) but no allocations (we
+ * also have a non-NULL clean list). We can't simply
+ * move the dirty list into the clean list on this
+ * CPU, yet we also don't want to allow this condition
+ * to persist, lest a short clean list prevent a
+ * massive dirty list from being cleaned (which in
+ * turn could lead to otherwise avoidable dynamic
+ * drops). To deal with this, we look for some CPU
+ * with a NULL clean list, NULL dirty list, and NULL
+ * rinsing list -- and then we borrow this CPU to
+ * rinse our dirty list.
+ */
+ for (j = 0; j < NCPU; j++) {
+ dtrace_dstate_percpu_t *rinser;
+
+ rinser = &dstate->dtds_percpu[j];
+
+ if (rinser->dtdsc_rinsing != NULL)
+ continue;
+
+ if (rinser->dtdsc_dirty != NULL)
+ continue;
+
+ if (rinser->dtdsc_clean != NULL)
+ continue;
+
+ rinsep = &rinser->dtdsc_rinsing;
+ break;
+ }
+
+ if (j == NCPU) {
+ /*
+ * We were unable to find another CPU that
+ * could accept this dirty list -- we are
+ * therefore unable to clean it now.
+ */
+ dtrace_dynvar_failclean++;
+ continue;
+ }
+ }
work = 1;
@@ -1219,7 +1273,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
* on a hash chain, either the dirty list or the
* rinsing list for some CPU must be non-NULL.)
*/
- dcpu->dtdsc_rinsing = dirty;
+ *rinsep = dirty;
dtrace_membar_producer();
} while (dtrace_casptr(&dcpu->dtdsc_dirty,
dirty, NULL) != dirty);
@@ -1650,7 +1704,7 @@ retry:
ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
/*
- * Now we'll move the clean list to the free list.
+ * Now we'll move the clean list to our free list.
* It's impossible for this to fail: the only way
* the free list can be updated is through this
* code path, and only one CPU can own the clean list.
@@ -1663,6 +1717,7 @@ retry:
* owners of the clean lists out before resetting
* the clean lists.
*/
+ dcpu = &dstate->dtds_percpu[me];
rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
ASSERT(rval == NULL);
goto retry;
@@ -3600,7 +3655,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
int64_t index = (int64_t)tupregs[1].dttk_value;
int64_t remaining = (int64_t)tupregs[2].dttk_value;
size_t len = dtrace_strlen((char *)s, size);
- int64_t i = 0;
+ int64_t i;
if (!dtrace_canload(s, len + 1, mstate, vstate)) {
regs[rd] = NULL;
@@ -6655,7 +6710,7 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
{
dtrace_probe_t template, *probe;
dtrace_hash_t *hash = NULL;
- int len, best = INT_MAX, nmatched = 0;
+ int len, rc, best = INT_MAX, nmatched = 0;
dtrace_id_t i;
ASSERT(MUTEX_HELD(&dtrace_lock));
@@ -6667,7 +6722,8 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
if (pkp->dtpk_id != DTRACE_IDNONE) {
if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
- (void) (*matched)(probe, arg);
+ if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
+ return (DTRACE_MATCH_FAIL);
nmatched++;
}
return (nmatched);
@@ -6714,8 +6770,12 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
nmatched++;
- if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
+ if ((rc = (*matched)(probe, arg)) !=
+ DTRACE_MATCH_NEXT) {
+ if (rc == DTRACE_MATCH_FAIL)
+ return (DTRACE_MATCH_FAIL);
break;
+ }
}
return (nmatched);
@@ -6734,8 +6794,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
nmatched++;
- if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
+ if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
+ if (rc == DTRACE_MATCH_FAIL)
+ return (DTRACE_MATCH_FAIL);
break;
+ }
}
return (nmatched);
@@ -6955,7 +7018,7 @@ dtrace_unregister(dtrace_provider_id_t id)
dtrace_probe_t *probe, *first = NULL;
if (old->dtpv_pops.dtps_enable ==
- (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
+ (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
/*
* If DTrace itself is the provider, we're called with locks
* already held.
@@ -7101,7 +7164,7 @@ dtrace_invalidate(dtrace_provider_id_t id)
dtrace_provider_t *pvp = (dtrace_provider_t *)id;
ASSERT(pvp->dtpv_pops.dtps_enable !=
- (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
+ (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
@@ -7142,7 +7205,7 @@ dtrace_condense(dtrace_provider_id_t id)
* Make sure this isn't the dtrace provider itself.
*/
ASSERT(prov->dtpv_pops.dtps_enable !=
- (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
+ (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
@@ -8103,7 +8166,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
break;
default:
- err += efunc(dp->dtdo_len - 1, "bad return size");
+ err += efunc(dp->dtdo_len - 1, "bad return size\n");
}
}
@@ -9096,7 +9159,7 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
return (ecb);
}
-static void
+static int
dtrace_ecb_enable(dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
@@ -9109,7 +9172,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb)
/*
* This is the NULL probe -- there's nothing to do.
*/
- return;
+ return (0);
}
if (probe->dtpr_ecb == NULL) {
@@ -9123,8 +9186,8 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb)
if (ecb->dte_predicate != NULL)
probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
- prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
- probe->dtpr_id, probe->dtpr_arg);
+ return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
+ probe->dtpr_id, probe->dtpr_arg));
} else {
/*
* This probe is already active. Swing the last pointer to
@@ -9137,6 +9200,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb)
probe->dtpr_predcache = 0;
dtrace_sync();
+ return (0);
}
}
@@ -9920,7 +9984,9 @@ dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
return (DTRACE_MATCH_DONE);
- dtrace_ecb_enable(ecb);
+ if (dtrace_ecb_enable(ecb) < 0)
+ return (DTRACE_MATCH_FAIL);
+
return (DTRACE_MATCH_NEXT);
}
@@ -10557,6 +10623,7 @@ dtrace_enabling_destroy(dtrace_enabling_t *enab)
ASSERT(enab->dten_vstate->dtvs_state != NULL);
ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
enab->dten_vstate->dtvs_state->dts_nretained--;
+ dtrace_retained_gen++;
}
if (enab->dten_prev == NULL) {
@@ -10599,6 +10666,7 @@ dtrace_enabling_retain(dtrace_enabling_t *enab)
return (ENOSPC);
state->dts_nretained++;
+ dtrace_retained_gen++;
if (dtrace_retained == NULL) {
dtrace_retained = enab;
@@ -10713,7 +10781,7 @@ static int
dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
{
int i = 0;
- int matched = 0;
+ int total_matched = 0, matched = 0;
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
@@ -10724,7 +10792,14 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
enab->dten_current = ep;
enab->dten_error = 0;
- matched += dtrace_probe_enable(&ep->dted_probe, enab);
+ /*
+ * If a provider failed to enable a probe then get out and
+ * let the consumer know we failed.
+ */
+ if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
+ return (EBUSY);
+
+ total_matched += matched;
if (enab->dten_error != 0) {
/*
@@ -10752,7 +10827,7 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
enab->dten_probegen = dtrace_probegen;
if (nmatched != NULL)
- *nmatched = matched;
+ *nmatched = total_matched;
return (0);
}
@@ -10766,13 +10841,22 @@ dtrace_enabling_matchall(void)
mutex_enter(&dtrace_lock);
/*
- * Because we can be called after dtrace_detach() has been called, we
- * cannot assert that there are retained enablings. We can safely
- * load from dtrace_retained, however: the taskq_destroy() at the
- * end of dtrace_detach() will block pending our completion.
+ * Iterate over all retained enablings to see if any probes match
+ * against them. We only perform this operation on enablings for which
+ * we have sufficient permissions by virtue of being in the global zone
+ * or in the same zone as the DTrace client. Because we can be called
+ * after dtrace_detach() has been called, we cannot assert that there
+ * are retained enablings. We can safely load from dtrace_retained,
+ * however: the taskq_destroy() at the end of dtrace_detach() will
+ * block pending our completion.
*/
- for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next)
- (void) dtrace_enabling_match(enab, NULL);
+ for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
+ cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
+
+ if (INGLOBALZONE(curproc) ||
+ cr != NULL && getzoneid() == crgetzoneid(cr))
+ (void) dtrace_enabling_match(enab, NULL);
+ }
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
@@ -10830,6 +10914,7 @@ dtrace_enabling_provide(dtrace_provider_t *prv)
{
int i, all = 0;
dtrace_probedesc_t desc;
+ dtrace_genid_t gen;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
@@ -10840,15 +10925,25 @@ dtrace_enabling_provide(dtrace_provider_t *prv)
}
do {
- dtrace_enabling_t *enab = dtrace_retained;
+ dtrace_enabling_t *enab;
void *parg = prv->dtpv_arg;
- for (; enab != NULL; enab = enab->dten_next) {
+retry:
+ gen = dtrace_retained_gen;
+ for (enab = dtrace_retained; enab != NULL;
+ enab = enab->dten_next) {
for (i = 0; i < enab->dten_ndesc; i++) {
desc = enab->dten_desc[i]->dted_probe;
mutex_exit(&dtrace_lock);
prv->dtpv_pops.dtps_provide(parg, &desc);
mutex_enter(&dtrace_lock);
+ /*
+ * Process the retained enablings again if
+ * they have changed while we weren't holding
+ * dtrace_lock.
+ */
+ if (gen != dtrace_retained_gen)
+ goto retry;
}
}
} while (all && (prv = prv->dtpv_next) != NULL);
@@ -10970,7 +11065,8 @@ dtrace_dof_copyin(uintptr_t uarg, int *errp)
dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
- if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) {
+ if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
+ dof->dofh_loadsz != hdr.dofh_loadsz) {
kmem_free(dof, hdr.dofh_loadsz);
*errp = EFAULT;
return (NULL);
@@ -11698,6 +11794,13 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
}
}
+ if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
+ !(sec->dofs_flags & DOF_SECF_LOAD)) {
+ dtrace_dof_error(dof, "loadable section with load "
+ "flag unset");
+ return (-1);
+ }
+
if (!(sec->dofs_flags & DOF_SECF_LOAD))
continue; /* just ignore non-loadable sections */
@@ -14390,7 +14493,8 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
* If this wasn't an open with the "helper" minor, then it must be
* the "dtrace" minor.
*/
- ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
+ if (getminor(*devp) != DTRACEMNRN_DTRACE)
+ return (ENXIO);
/*
* If no DTRACE_PRIV_* bits are set in the credential, then the
@@ -14427,7 +14531,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
mutex_exit(&cpu_lock);
if (state == NULL) {
- if (--dtrace_opens == 0)
+ if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
mutex_exit(&dtrace_lock);
return (EAGAIN);
@@ -14463,7 +14567,12 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
dtrace_state_destroy(state);
ASSERT(dtrace_opens > 0);
- if (--dtrace_opens == 0)
+
+ /*
+ * Only relinquish control of the kernel debugger interface when there
+ * are no consumers and no anonymous enablings.
+ */
+ if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
mutex_exit(&dtrace_lock);
@@ -15458,7 +15567,8 @@ static struct dev_ops dtrace_ops = {
nodev, /* reset */
&dtrace_cb_ops, /* driver operations */
NULL, /* bus operations */
- nodev /* dev power */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed, /* quiesce */
};
static struct modldrv modldrv = {
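
The dtrace.c hunks above change the provider enable entry point (dtps_enable in dtrace_pops_t) from returning void to returning int, and thread that status through dtrace_ecb_enable(), dtrace_match() and dtrace_enabling_match() so that a failed enable reaches the consumer as EBUSY instead of being silently dropped. Below is a minimal sketch of an enable callback under the new contract; the "example" provider, example_probe_t and example_hw_reserve() are hypothetical, and only the signature and return convention are taken from the patch.

#include <sys/dtrace.h>

static int example_hw_reserve(int);	/* hypothetical helper, body elided */

typedef struct example_probe {
	int	exp_channel;		/* hypothetical per-probe resource */
	int	exp_enabled;
} example_probe_t;

/*
 * dtps_enable now returns int: 0 means the probe is armed; a negative
 * value makes dtrace_ecb_create_enable() return DTRACE_MATCH_FAIL and
 * dtrace_enabling_match() hand EBUSY back to the consumer.
 */
/*ARGSUSED*/
static int
example_enable(void *arg, dtrace_id_t id, void *parg)
{
	example_probe_t *probe = parg;

	if (example_hw_reserve(probe->exp_channel) != 0)
		return (-1);		/* could not arm the probe */

	probe->exp_enabled = 1;
	return (0);
}
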
diff --git a/uts/common/dtrace/fasttrap.c b/uts/common/dtrace/fasttrap.c
index b7ca92f54a59..42263e4ef274 100644
--- a/uts/common/dtrace/fasttrap.c
+++ b/uts/common/dtrace/fasttrap.c
@@ -20,11 +20,10 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/atomic.h>
#include <sys/errno.h>
@@ -876,7 +875,7 @@ fasttrap_disable_callbacks(void)
}
/*ARGSUSED*/
-static void
+static int
fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
{
fasttrap_probe_t *probe = parg;
@@ -904,7 +903,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
* provider can't go away while we're in this code path.
*/
if (probe->ftp_prov->ftp_retired)
- return;
+ return (0);
/*
* If we can't find the process, it may be that we're in the context of
@@ -913,7 +912,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
*/
if ((p = sprlock(probe->ftp_pid)) == NULL) {
if ((curproc->p_flag & SFORKING) == 0)
- return;
+ return (0);
mutex_enter(&pidlock);
p = prfind(probe->ftp_pid);
@@ -975,7 +974,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
* drop our reference on the trap table entry.
*/
fasttrap_disable_callbacks();
- return;
+ return (0);
}
}
@@ -983,6 +982,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
sprunlock(p);
probe->ftp_enabled = 1;
+ return (0);
}
/*ARGSUSED*/
@@ -1946,7 +1946,8 @@ fasttrap_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
probe = kmem_alloc(size, KM_SLEEP);
- if (copyin(uprobe, probe, size) != 0) {
+ if (copyin(uprobe, probe, size) != 0 ||
+ probe->ftps_noffs != noffs) {
kmem_free(probe, size);
return (EFAULT);
}
@@ -2044,13 +2045,6 @@ err:
tp->ftt_proc->ftpc_acount != 0)
break;
- /*
- * The count of active providers can only be
- * decremented (i.e. to zero) during exec, exit, and
- * removal of a meta provider so it should be
- * impossible to drop the count during this operation().
- */
- ASSERT(tp->ftt_proc->ftpc_acount != 0);
tp = tp->ftt_next;
}
@@ -2346,7 +2340,8 @@ static struct dev_ops fasttrap_ops = {
nodev, /* reset */
&fasttrap_cb_ops, /* driver operations */
NULL, /* bus operations */
- nodev /* dev power */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed, /* quiesce */
};
/*
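
The dtrace_dof_copyin() and fasttrap_ioctl() hunks above add the same hardening: after the full object has been copied in, the size/count field embedded in it is re-checked against the value that sized the kernel allocation, so a user thread racing the two copyin() calls cannot inflate the field and have later code walk past the buffer. A condensed sketch of the idiom follows, with a hypothetical my_hdr_t (and MY_MAX_SIZE bound) standing in for dof_hdr_t/dofh_loadsz and fasttrap_probe_spec_t/ftps_noffs.

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/systm.h>			/* copyin() */

#define	MY_MAX_SIZE	(64 * 1024)	/* hypothetical upper bound */

typedef struct my_hdr {
	size_t	h_size;			/* total size, header included */
	/* ... payload follows ... */
} my_hdr_t;

static void *
my_copyin(uintptr_t uarg, int *errp)
{
	my_hdr_t hdr, *buf;

	/* First pass: read just the header to learn the object size. */
	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
		*errp = EFAULT;
		return (NULL);
	}

	if (hdr.h_size < sizeof (hdr) || hdr.h_size > MY_MAX_SIZE) {
		*errp = EINVAL;
		return (NULL);
	}

	buf = kmem_alloc(hdr.h_size, KM_SLEEP);

	/*
	 * Second pass: copy the whole object, then re-check that the
	 * embedded size still matches the allocation.  Without the
	 * second comparison a racing thread could grow h_size after
	 * the first copyin() and later code would read past the end
	 * of the buffer.
	 */
	if (copyin((void *)uarg, buf, hdr.h_size) != 0 ||
	    buf->h_size != hdr.h_size) {
		kmem_free(buf, hdr.h_size);
		*errp = EFAULT;
		return (NULL);
	}

	return (buf);
}
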
diff --git a/uts/common/dtrace/lockstat.c b/uts/common/dtrace/lockstat.c
index 3eb76a061d32..69c8b7254486 100644
--- a/uts/common/dtrace/lockstat.c
+++ b/uts/common/dtrace/lockstat.c
@@ -19,11 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/param.h>
@@ -84,7 +83,7 @@ static kmutex_t lockstat_test; /* for testing purposes only */
static dtrace_provider_id_t lockstat_id;
/*ARGSUSED*/
-static void
+static int
lockstat_enable(void *arg, dtrace_id_t id, void *parg)
{
lockstat_probe_t *probe = parg;
@@ -103,6 +102,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg)
*/
mutex_enter(&lockstat_test);
mutex_exit(&lockstat_test);
+ return (0);
}
/*ARGSUSED*/
@@ -310,11 +310,13 @@ static struct dev_ops lockstat_ops = {
nulldev, /* reset */
&lockstat_cb_ops, /* cb_ops */
NULL, /* bus_ops */
+ NULL, /* power */
+ ddi_quiesce_not_needed, /* quiesce */
};
static struct modldrv modldrv = {
&mod_driverops, /* Type of module. This one is a driver */
- "Lock Statistics %I%", /* name of module */
+ "Lock Statistics", /* name of module */
&lockstat_ops, /* driver ops */
};
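
Each driver touched by this import also grows a devo_quiesce entry in its struct dev_ops (lockstat's table had been missing the devo_power slot as well, which is why its hunk adds two members). None of these pseudo-devices has hardware to silence for fast reboot, so they all plug in ddi_quiesce_not_needed(9F). Below is a sketch of the resulting table for a hypothetical pseudo driver; the exdrv_* symbols are assumptions and their bodies are elided.

#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sunddi.h>

static int exdrv_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int exdrv_attach(dev_info_t *, ddi_attach_cmd_t);
static int exdrv_detach(dev_info_t *, ddi_detach_cmd_t);
static struct cb_ops exdrv_cb_ops;	/* assumed defined as usual */

static struct dev_ops exdrv_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	exdrv_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	exdrv_attach,		/* attach */
	exdrv_detach,		/* detach */
	nodev,			/* reset */
	&exdrv_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};
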
diff --git a/uts/common/dtrace/profile.c b/uts/common/dtrace/profile.c
index 8de919a851a2..c1a2d1f1c12f 100644
--- a/uts/common/dtrace/profile.c
+++ b/uts/common/dtrace/profile.c
@@ -19,11 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/errno.h>
#include <sys/stat.h>
@@ -361,7 +360,7 @@ profile_offline(void *arg, cpu_t *cpu, void *oarg)
}
/*ARGSUSED*/
-static void
+static int
profile_enable(void *arg, dtrace_id_t id, void *parg)
{
profile_probe_t *prof = parg;
@@ -391,6 +390,7 @@ profile_enable(void *arg, dtrace_id_t id, void *parg)
} else {
prof->prof_cyclic = cyclic_add_omni(&omni);
}
+ return (0);
}
/*ARGSUSED*/
@@ -539,7 +539,8 @@ static struct dev_ops profile_ops = {
nodev, /* reset */
&profile_cb_ops, /* driver operations */
NULL, /* bus operations */
- nodev /* dev power */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed, /* quiesce */
};
/*
diff --git a/uts/common/dtrace/sdt_subr.c b/uts/common/dtrace/sdt_subr.c
index 66ff8a92a01b..242185071bb2 100644
--- a/uts/common/dtrace/sdt_subr.c
+++ b/uts/common/dtrace/sdt_subr.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/sdt_impl.h>
static dtrace_pattr_t vtrace_attr = {
@@ -43,6 +40,14 @@ static dtrace_pattr_t info_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
+static dtrace_pattr_t fc_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+};
+
static dtrace_pattr_t fpu_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
@@ -83,6 +88,14 @@ static dtrace_pattr_t xpv_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};
+static dtrace_pattr_t iscsi_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+};
+
sdt_provider_t sdt_providers[] = {
{ "vtrace", "__vtrace_", &vtrace_attr, 0 },
{ "sysinfo", "__cpu_sysinfo_", &info_attr, 0 },
@@ -91,11 +104,17 @@ sdt_provider_t sdt_providers[] = {
{ "sched", "__sched_", &stab_attr, 0 },
{ "proc", "__proc_", &stab_attr, 0 },
{ "io", "__io_", &stab_attr, 0 },
+ { "ip", "__ip_", &stab_attr, 0 },
+ { "tcp", "__tcp_", &stab_attr, 0 },
+ { "udp", "__udp_", &stab_attr, 0 },
{ "mib", "__mib_", &stab_attr, 0 },
{ "fsinfo", "__fsinfo_", &fsinfo_attr, 0 },
+ { "iscsi", "__iscsi_", &iscsi_attr, 0 },
{ "nfsv3", "__nfsv3_", &stab_attr, 0 },
{ "nfsv4", "__nfsv4_", &stab_attr, 0 },
{ "xpv", "__xpv_", &xpv_attr, 0 },
+ { "fc", "__fc_", &fc_attr, 0 },
+ { "srp", "__srp_", &fc_attr, 0 },
{ "sysevent", "__sysevent_", &stab_attr, 0 },
{ "sdt", NULL, &sdt_attr, 0 },
{ NULL }
@@ -169,6 +188,73 @@ sdt_argdesc_t sdt_args[] = {
{ "fsinfo", NULL, 0, 0, "vnode_t *", "fileinfo_t *" },
{ "fsinfo", NULL, 1, 1, "int", "int" },
+ { "iscsi", "async-send", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "async-send", 1, 1, "iscsi_async_evt_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "login-command", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "login-command", 1, 1, "iscsi_login_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "login-response", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "login-response", 1, 1, "iscsi_login_rsp_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "logout-command", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "logout-command", 1, 1, "iscsi_logout_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "logout-response", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "logout-response", 1, 1, "iscsi_logout_rsp_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "data-request", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "data-request", 1, 1, "iscsi_rtt_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "data-send", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "data-send", 1, 1, "iscsi_data_rsp_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "data-receive", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "data-receive", 1, 1, "iscsi_data_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "nop-send", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "nop-send", 1, 1, "iscsi_nop_in_hdr_t *", "iscsiinfo_t *" },
+ { "iscsi", "nop-receive", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "nop-receive", 1, 1, "iscsi_nop_out_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "scsi-command", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "scsi-command", 1, 1, "iscsi_scsi_cmd_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "scsi-command", 2, 2, "scsi_task_t *", "scsicmd_t *" },
+ { "iscsi", "scsi-response", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "scsi-response", 1, 1, "iscsi_scsi_rsp_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "task-command", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "task-command", 1, 1, "iscsi_scsi_task_mgt_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "task-response", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "task-response", 1, 1, "iscsi_scsi_task_mgt_rsp_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "text-command", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "text-command", 1, 1, "iscsi_text_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "text-response", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "text-response", 1, 1, "iscsi_text_rsp_hdr_t *",
+ "iscsiinfo_t *" },
+ { "iscsi", "xfer-start", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "xfer-start", 1, 0, "idm_conn_t *", "iscsiinfo_t *" },
+ { "iscsi", "xfer-start", 2, 1, "uintptr_t", "xferinfo_t *" },
+ { "iscsi", "xfer-start", 3, 2, "uint32_t"},
+ { "iscsi", "xfer-start", 4, 3, "uintptr_t"},
+ { "iscsi", "xfer-start", 5, 4, "uint32_t"},
+ { "iscsi", "xfer-start", 6, 5, "uint32_t"},
+ { "iscsi", "xfer-start", 7, 6, "uint32_t"},
+ { "iscsi", "xfer-start", 8, 7, "int"},
+ { "iscsi", "xfer-done", 0, 0, "idm_conn_t *", "conninfo_t *" },
+ { "iscsi", "xfer-done", 1, 0, "idm_conn_t *", "iscsiinfo_t *" },
+ { "iscsi", "xfer-done", 2, 1, "uintptr_t", "xferinfo_t *" },
+ { "iscsi", "xfer-done", 3, 2, "uint32_t"},
+ { "iscsi", "xfer-done", 4, 3, "uintptr_t"},
+ { "iscsi", "xfer-done", 5, 4, "uint32_t"},
+ { "iscsi", "xfer-done", 6, 5, "uint32_t"},
+ { "iscsi", "xfer-done", 7, 6, "uint32_t"},
+ { "iscsi", "xfer-done", 8, 7, "int"},
+
{ "nfsv3", "op-getattr-start", 0, 0, "struct svc_req *",
"conninfo_t *" },
{ "nfsv3", "op-getattr-start", 1, 1, "nfsv3oparg_t *",
@@ -788,6 +874,75 @@ sdt_argdesc_t sdt_args[] = {
"nfsv4cbinfo_t *" },
{ "nfsv4", "cb-recall-done", 2, 2, "CB_RECALL4res *" },
+ { "ip", "send", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "ip", "send", 1, 1, "conn_t *", "csinfo_t *" },
+ { "ip", "send", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "ip", "send", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" },
+ { "ip", "send", 4, 4, "ipha_t *", "ipv4info_t *" },
+ { "ip", "send", 5, 5, "ip6_t *", "ipv6info_t *" },
+ { "ip", "send", 6, 6, "int" }, /* used by __dtrace_ipsr_ill_t */
+ { "ip", "receive", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "ip", "receive", 1, 1, "conn_t *", "csinfo_t *" },
+ { "ip", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "ip", "receive", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" },
+ { "ip", "receive", 4, 4, "ipha_t *", "ipv4info_t *" },
+ { "ip", "receive", 5, 5, "ip6_t *", "ipv6info_t *" },
+ { "ip", "receive", 6, 6, "int" }, /* used by __dtrace_ipsr_ill_t */
+
+ { "tcp", "connect-established", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "tcp", "connect-established", 1, 1, "ip_xmit_attr_t *",
+ "csinfo_t *" },
+ { "tcp", "connect-established", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "tcp", "connect-established", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "connect-established", 4, 4, "tcph_t *", "tcpinfo_t *" },
+ { "tcp", "connect-refused", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "tcp", "connect-refused", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "tcp", "connect-refused", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "tcp", "connect-refused", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "connect-refused", 4, 4, "tcph_t *", "tcpinfo_t *" },
+ { "tcp", "connect-request", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "tcp", "connect-request", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "tcp", "connect-request", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "tcp", "connect-request", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "connect-request", 4, 4, "tcph_t *", "tcpinfo_t *" },
+ { "tcp", "accept-established", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "tcp", "accept-established", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "tcp", "accept-established", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "tcp", "accept-established", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "accept-established", 4, 4, "tcph_t *", "tcpinfo_t *" },
+ { "tcp", "accept-refused", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "tcp", "accept-refused", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "tcp", "accept-refused", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "tcp", "accept-refused", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "accept-refused", 4, 4, "tcph_t *", "tcpinfo_t *" },
+ { "tcp", "state-change", 0, 0, "void", "void" },
+ { "tcp", "state-change", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "tcp", "state-change", 2, 2, "void", "void" },
+ { "tcp", "state-change", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "state-change", 4, 4, "void", "void" },
+ { "tcp", "state-change", 5, 5, "int32_t", "tcplsinfo_t *" },
+ { "tcp", "send", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "tcp", "send", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "tcp", "send", 2, 2, "__dtrace_tcp_void_ip_t *", "ipinfo_t *" },
+ { "tcp", "send", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "send", 4, 4, "__dtrace_tcp_tcph_t *", "tcpinfo_t *" },
+ { "tcp", "receive", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "tcp", "receive", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "tcp", "receive", 2, 2, "__dtrace_tcp_void_ip_t *", "ipinfo_t *" },
+ { "tcp", "receive", 3, 3, "tcp_t *", "tcpsinfo_t *" },
+ { "tcp", "receive", 4, 4, "__dtrace_tcp_tcph_t *", "tcpinfo_t *" },
+
+ { "udp", "send", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "udp", "send", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "udp", "send", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "udp", "send", 3, 3, "udp_t *", "udpsinfo_t *" },
+ { "udp", "send", 4, 4, "udpha_t *", "udpinfo_t *" },
+ { "udp", "receive", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "udp", "receive", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" },
+ { "udp", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" },
+ { "udp", "receive", 3, 3, "udp_t *", "udpsinfo_t *" },
+ { "udp", "receive", 4, 4, "udpha_t *", "udpinfo_t *" },
+
{ "sysevent", "post", 0, 0, "evch_bind_t *", "syseventchaninfo_t *" },
{ "sysevent", "post", 1, 1, "sysevent_impl_t *", "syseventinfo_t *" },
@@ -848,6 +1003,154 @@ sdt_argdesc_t sdt_args[] = {
{ "xpv", "setvcpucontext-end", 0, 0, "int" },
{ "xpv", "setvcpucontext-start", 0, 0, "domid_t" },
{ "xpv", "setvcpucontext-start", 1, 1, "vcpu_guest_context_t *" },
+
+ { "srp", "service-up", 0, 0, "srpt_session_t *", "conninfo_t *" },
+ { "srp", "service-up", 1, 0, "srpt_session_t *", "srp_portinfo_t *" },
+ { "srp", "service-down", 0, 0, "srpt_session_t *", "conninfo_t *" },
+ { "srp", "service-down", 1, 0, "srpt_session_t *",
+ "srp_portinfo_t *" },
+ { "srp", "login-command", 0, 0, "srpt_session_t *", "conninfo_t *" },
+ { "srp", "login-command", 1, 0, "srpt_session_t *",
+ "srp_portinfo_t *" },
+ { "srp", "login-command", 2, 1, "srp_login_req_t *",
+ "srp_logininfo_t *" },
+ { "srp", "login-response", 0, 0, "srpt_session_t *", "conninfo_t *" },
+ { "srp", "login-response", 1, 0, "srpt_session_t *",
+ "srp_portinfo_t *" },
+ { "srp", "login-response", 2, 1, "srp_login_rsp_t *",
+ "srp_logininfo_t *" },
+ { "srp", "login-response", 3, 2, "srp_login_rej_t *" },
+ { "srp", "logout-command", 0, 0, "srpt_channel_t *", "conninfo_t *" },
+ { "srp", "logout-command", 1, 0, "srpt_channel_t *",
+ "srp_portinfo_t *" },
+ { "srp", "task-command", 0, 0, "srpt_channel_t *", "conninfo_t *" },
+ { "srp", "task-command", 1, 0, "srpt_channel_t *",
+ "srp_portinfo_t *" },
+ { "srp", "task-command", 2, 1, "srp_cmd_req_t *", "srp_taskinfo_t *" },
+ { "srp", "task-response", 0, 0, "srpt_channel_t *", "conninfo_t *" },
+ { "srp", "task-response", 1, 0, "srpt_channel_t *",
+ "srp_portinfo_t *" },
+ { "srp", "task-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" },
+ { "srp", "task-response", 3, 2, "scsi_task_t *" },
+ { "srp", "task-response", 4, 3, "int8_t" },
+ { "srp", "scsi-command", 0, 0, "srpt_channel_t *", "conninfo_t *" },
+ { "srp", "scsi-command", 1, 0, "srpt_channel_t *",
+ "srp_portinfo_t *" },
+ { "srp", "scsi-command", 2, 1, "scsi_task_t *", "scsicmd_t *" },
+ { "srp", "scsi-command", 3, 2, "srp_cmd_req_t *", "srp_taskinfo_t *" },
+ { "srp", "scsi-response", 0, 0, "srpt_channel_t *", "conninfo_t *" },
+ { "srp", "scsi-response", 1, 0, "srpt_channel_t *",
+ "srp_portinfo_t *" },
+ { "srp", "scsi-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" },
+ { "srp", "scsi-response", 3, 2, "scsi_task_t *" },
+ { "srp", "scsi-response", 4, 3, "int8_t" },
+ { "srp", "xfer-start", 0, 0, "srpt_channel_t *", "conninfo_t *" },
+ { "srp", "xfer-start", 1, 0, "srpt_channel_t *",
+ "srp_portinfo_t *" },
+ { "srp", "xfer-start", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" },
+ { "srp", "xfer-start", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" },
+ { "srp", "xfer-start", 4, 3, "ibt_send_wr_t *"},
+ { "srp", "xfer-start", 5, 4, "uint32_t" },
+ { "srp", "xfer-start", 6, 5, "uint32_t" },
+ { "srp", "xfer-start", 7, 6, "uint32_t" },
+ { "srp", "xfer-start", 8, 7, "uint32_t" },
+ { "srp", "xfer-done", 0, 0, "srpt_channel_t *", "conninfo_t *" },
+ { "srp", "xfer-done", 1, 0, "srpt_channel_t *",
+ "srp_portinfo_t *" },
+ { "srp", "xfer-done", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" },
+ { "srp", "xfer-done", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" },
+ { "srp", "xfer-done", 4, 3, "ibt_send_wr_t *"},
+ { "srp", "xfer-done", 5, 4, "uint32_t" },
+ { "srp", "xfer-done", 6, 5, "uint32_t" },
+ { "srp", "xfer-done", 7, 6, "uint32_t" },
+ { "srp", "xfer-done", 8, 7, "uint32_t" },
+
+ { "fc", "link-up", 0, 0, "fct_i_local_port_t *", "conninfo_t *" },
+ { "fc", "link-down", 0, 0, "fct_i_local_port_t *", "conninfo_t *" },
+ { "fc", "fabric-login-start", 0, 0, "fct_i_local_port_t *",
+ "conninfo_t *" },
+ { "fc", "fabric-login-start", 1, 0, "fct_i_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "fabric-login-end", 0, 0, "fct_i_local_port_t *",
+ "conninfo_t *" },
+ { "fc", "fabric-login-end", 1, 0, "fct_i_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-login-start", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "rport-login-start", 1, 1, "fct_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-login-start", 2, 2, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-login-start", 3, 3, "int", "int" },
+ { "fc", "rport-login-end", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "rport-login-end", 1, 1, "fct_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-login-end", 2, 2, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-login-end", 3, 3, "int", "int" },
+ { "fc", "rport-login-end", 4, 4, "int", "int" },
+ { "fc", "rport-logout-start", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "rport-logout-start", 1, 1, "fct_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-logout-start", 2, 2, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-logout-start", 3, 3, "int", "int" },
+ { "fc", "rport-logout-end", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "rport-logout-end", 1, 1, "fct_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-logout-end", 2, 2, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "rport-logout-end", 3, 3, "int", "int" },
+ { "fc", "scsi-command", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "scsi-command", 1, 1, "fct_i_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "scsi-command", 2, 2, "scsi_task_t *",
+ "scsicmd_t *" },
+ { "fc", "scsi-command", 3, 3, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "scsi-response", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "scsi-response", 1, 1, "fct_i_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "scsi-response", 2, 2, "scsi_task_t *",
+ "scsicmd_t *" },
+ { "fc", "scsi-response", 3, 3, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "xfer-start", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "xfer-start", 1, 1, "fct_i_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "xfer-start", 2, 2, "scsi_task_t *",
+ "scsicmd_t *" },
+ { "fc", "xfer-start", 3, 3, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "xfer-start", 4, 4, "stmf_data_buf_t *",
+ "fc_xferinfo_t *" },
+ { "fc", "xfer-done", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "xfer-done", 1, 1, "fct_i_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "xfer-done", 2, 2, "scsi_task_t *",
+ "scsicmd_t *" },
+ { "fc", "xfer-done", 3, 3, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "xfer-done", 4, 4, "stmf_data_buf_t *",
+ "fc_xferinfo_t *" },
+ { "fc", "rscn-receive", 0, 0, "fct_i_local_port_t *",
+ "conninfo_t *" },
+ { "fc", "rscn-receive", 1, 1, "int", "int"},
+ { "fc", "abts-receive", 0, 0, "fct_cmd_t *",
+ "conninfo_t *" },
+ { "fc", "abts-receive", 1, 1, "fct_i_local_port_t *",
+ "fc_port_info_t *" },
+ { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *",
+ "fc_port_info_t *" },
+
+
{ NULL }
};
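
The new ip, tcp, udp, iscsi, fc and srp entries in sdt_args[] all follow the same positional convention: provider, probe name, the argument index a D script sees, the index of the native argument the kernel actually passes, the native C type, and optionally the translated type presented to D. One native argument can back several D arguments, each through its own translator, and an entry without a translated type is passed through untouched (as with the trailing integer arguments of the iscsi and srp xfer probes). The small table below illustrates those conventions for a hypothetical "exnet" provider, using a local stand-in structure only to make the columns explicit; the kernel's own argument-descriptor type lives in sys/sdt_impl.h.

typedef struct ex_argdesc {
	const char	*x_provider;	/* SDT provider */
	const char	*x_probe;	/* probe name, NULL = all probes */
	int		x_darg;		/* argument index seen from D */
	int		x_native;	/* native argument index */
	const char	*x_native_type;	/* type passed by the kernel */
	const char	*x_xlate_type;	/* translator output, NULL = none */
} ex_argdesc_t;

static const ex_argdesc_t exnet_args[] = {
	{ "exnet", "frame-send", 0, 0, "mblk_t *", "pktinfo_t *" },
	/* D args 1 and 2 are both derived from native arg 1: */
	{ "exnet", "frame-send", 1, 1, "exnet_conn_t *", "conninfo_t *" },
	{ "exnet", "frame-send", 2, 1, "exnet_conn_t *", "exnetinfo_t *" },
	/* native arg 2 is passed through with no translator: */
	{ "exnet", "frame-send", 3, 2, "int", NULL },
	{ NULL }
};
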
diff --git a/uts/common/dtrace/systrace.c b/uts/common/dtrace/systrace.c
index be14660b04c0..b864041c450d 100644
--- a/uts/common/dtrace/systrace.c
+++ b/uts/common/dtrace/systrace.c
@@ -19,11 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/dtrace.h>
#include <sys/systrace.h>
@@ -141,7 +140,7 @@ systrace_destroy(void *arg, dtrace_id_t id, void *parg)
}
/*ARGSUSED*/
-static void
+static int
systrace_enable(void *arg, dtrace_id_t id, void *parg)
{
int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
@@ -162,7 +161,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg)
if (enabled) {
ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
- return;
+ return (0);
}
(void) casptr(&sysent[sysnum].sy_callc,
@@ -173,6 +172,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg)
(void *)systrace_sysent32[sysnum].stsy_underlying,
(void *)dtrace_systrace_syscall32);
#endif
+ return (0);
}
/*ARGSUSED*/
@@ -336,7 +336,8 @@ static struct dev_ops systrace_ops = {
nodev, /* reset */
&systrace_cb_ops, /* driver operations */
NULL, /* bus operations */
- nodev /* dev power */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed, /* quiesce */
};
/*
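
The new gfs.c added below documents a "Typical VOP_READDIR usage" of its gfs_readdir_* helpers in the file's header comment. Here is a fleshed-out sketch of that loop for a hypothetical pseudo-filesystem serving a small fixed table of entries; the ex_* names, inode numbers and MAXNAMELEN sizing are assumptions, while the gfs_get_parent_ino() and gfs_readdir_*() calls match the interfaces introduced in the new file.

#include <sys/types.h>
#include <sys/param.h>		/* MAXNAMELEN */
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/cred.h>
#include <sys/gfs.h>

typedef struct ex_entry {
	const char	*xe_name;
	ino64_t		xe_ino;
} ex_entry_t;

static const ex_entry_t ex_entries[] = {
	{ "status",	3 },
	{ "control",	4 },
};

static int
ex_readdir(vnode_t *dvp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	gfs_readdir_state_t gstate;
	offset_t voff;
	ino64_t pino, ino;
	int nentries = sizeof (ex_entries) / sizeof (ex_entries[0]);
	int error, eof = 0;

	if ((error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino)) != 0)
		return (error);

	if ((error = gfs_readdir_init(&gstate, MAXNAMELEN, 1, uiop,
	    pino, ino, flags)) != 0)
		return (error);

	/* gfs_readdir_pred() emits "." and ".." and hands back offsets. */
	while ((error = gfs_readdir_pred(&gstate, uiop, &voff)) == 0) {
		if (voff >= nentries) {
			eof = 1;
			break;
		}
		if ((error = gfs_readdir_emit(&gstate, uiop, voff,
		    ex_entries[voff].xe_ino, ex_entries[voff].xe_name,
		    0)) != 0)
			break;
	}

	/* gfs_readdir_fini() maps a -1 from pred/emit to success. */
	return (gfs_readdir_fini(&gstate, error, eofp, eof));
}
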
diff --git a/uts/common/fs/gfs.c b/uts/common/fs/gfs.c
new file mode 100644
index 000000000000..4d24df60f75b
--- /dev/null
+++ b/uts/common/fs/gfs.c
@@ -0,0 +1,1178 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2007 Shivakumar GN */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/kmem.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/sunddi.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+
+#include <vm/as.h>
+#include <vm/seg_vn.h>
+
+#include <sys/gfs.h>
+
+/*
+ * Generic pseudo-filesystem routines.
+ *
+ * There are significant similarities between the implementation of certain file
+ * system entry points across different filesystems. While one could attempt to
+ * "choke up on the bat" and incorporate common functionality into a VOP
+ * preamble or postamble, such an approach is limited in the benefit it can
+ * provide. In this file we instead define a toolkit of routines which can be
+ * called from a filesystem (with in-kernel pseudo-filesystems being the focus
+ * of the exercise) in a more component-like fashion.
+ *
+ * There are three basic classes of routines:
+ *
+ * 1) Lowlevel support routines
+ *
+ * These routines are designed to play a support role for existing
+ * pseudo-filesystems (such as procfs). They simplify common tasks,
+ * without forcing the filesystem to hand over management to GFS. The
+ * routines covered are:
+ *
+ * gfs_readdir_init()
+ * gfs_readdir_emit()
+ * gfs_readdir_emitn()
+ * gfs_readdir_pred()
+ * gfs_readdir_fini()
+ * gfs_lookup_dot()
+ *
+ * 2) Complete GFS management
+ *
+ * These routines take a more active role in management of the
+ * pseudo-filesystem. They handle the relationship between vnode private
+ * data and VFS data, as well as the relationship between vnodes in the
+ * directory hierarchy.
+ *
+ * In order to use these interfaces, the first member of every private
+ * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control
+ * to GFS.
+ *
+ * gfs_file_create()
+ * gfs_dir_create()
+ * gfs_root_create()
+ *
+ * gfs_file_inactive()
+ * gfs_dir_inactive()
+ * gfs_dir_lookup()
+ * gfs_dir_readdir()
+ *
+ * gfs_vop_inactive()
+ * gfs_vop_lookup()
+ * gfs_vop_readdir()
+ * gfs_vop_map()
+ *
+ * 3) Single File pseudo-filesystems
+ *
+ *	This routine creates a rooted file to be overlaid on top of another
+ * file in the physical filespace.
+ *
+ * Note that the parent is NULL (actually the vfs), but there is nothing
+ * technically keeping such a file from utilizing the "Complete GFS
+ * management" set of routines.
+ *
+ * gfs_root_create_file()
+ */
+
+/*
+ * gfs_make_opsvec: take an array of vnode type definitions and create
+ * their vnodeops_t structures
+ *
+ * This routine takes an array of gfs_opsvec_t's. It could
+ * alternatively take an array of gfs_opsvec_t*'s, which would allow
+ * vnode types to be completely defined in files external to the caller
+ * of gfs_make_opsvec(). As it stands, much more sharing takes place --
+ * both the caller and the vnode type provider need to access gfsv_ops
+ * and gfsv_template, and the caller also needs to know gfsv_name.
+ */
+int
+gfs_make_opsvec(gfs_opsvec_t *vec)
+{
+ int error, i;
+
+ for (i = 0; ; i++) {
+ if (vec[i].gfsv_name == NULL)
+ return (0);
+ error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
+ vec[i].gfsv_ops);
+ if (error)
+ break;
+ }
+
+ cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
+ vec[i].gfsv_name);
+ for (i--; i >= 0; i--) {
+ vn_freevnodeops(*vec[i].gfsv_ops);
+ *vec[i].gfsv_ops = NULL;
+ }
+ return (error);
+}
+
+/*
+ * Low level directory routines
+ *
+ * These routines provide some simple abstractions for reading directories.
+ * They are designed to be used by existing pseudo filesystems (namely procfs)
+ * that already have a complicated management infrastructure.
+ */
+
+/*
+ * gfs_get_parent_ino: used to obtain a parent inode number and the
+ * inode number of the given vnode in preparation for calling gfs_readdir_init.
+ */
+int
+gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+ ino64_t *pino, ino64_t *ino)
+{
+ vnode_t *parent;
+ gfs_dir_t *dp = dvp->v_data;
+ int error;
+
+ *ino = dp->gfsd_file.gfs_ino;
+ parent = dp->gfsd_file.gfs_parent;
+
+ if (parent == NULL) {
+ *pino = *ino; /* root of filesystem */
+ } else if (dvp->v_flag & V_XATTRDIR) {
+ vattr_t va;
+
+ va.va_mask = AT_NODEID;
+ error = VOP_GETATTR(parent, &va, 0, cr, ct);
+ if (error)
+ return (error);
+ *pino = va.va_nodeid;
+ } else {
+ *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
+ }
+
+ return (0);
+}
+
+/*
+ * gfs_readdir_init: initiate a generic readdir
+ * st - a pointer to an uninitialized gfs_readdir_state_t structure
+ * name_max - the directory's maximum file name length
+ * ureclen - the exported file-space record length (1 for non-legacy FSs)
+ * uiop - the uiop passed to readdir
+ * parent - the parent directory's inode
+ * self - this directory's inode
+ * flags - flags from VOP_READDIR
+ *
+ * Returns 0 or a non-zero errno.
+ *
+ * Typical VOP_READDIR usage of gfs_readdir_*:
+ *
+ * if ((error = gfs_readdir_init(...)) != 0)
+ * return (error);
+ * eof = 0;
+ *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
+ *		if (!consumer_entry_at(voffset))
+ *			voffset = consumer_next_entry(voffset);
+ *		if (consumer_eof(voffset)) {
+ *			eof = 1;
+ * break;
+ * }
+ * if ((error = gfs_readdir_emit(..., voffset,
+ * consumer_ino(voffset), consumer_name(voffset))) != 0)
+ * break;
+ * }
+ * return (gfs_readdir_fini(..., error, eofp, eof));
+ *
+ * As you can see, a zero result from gfs_readdir_pred() or
+ * gfs_readdir_emit() indicates that processing should continue,
+ * whereas a non-zero result indicates that the loop should terminate.
+ * Most consumers need do nothing more than let gfs_readdir_fini()
+ * determine what the cause of failure was and return the appropriate
+ * value.
+ */
+int
+gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
+ uio_t *uiop, ino64_t parent, ino64_t self, int flags)
+{
+ size_t dirent_size;
+
+ if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
+ (uiop->uio_loffset % ureclen) != 0)
+ return (EINVAL);
+
+ st->grd_ureclen = ureclen;
+ st->grd_oresid = uiop->uio_resid;
+ st->grd_namlen = name_max;
+ if (flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
+ st->grd_parent = parent;
+ st->grd_self = self;
+ st->grd_flags = flags;
+
+ return (0);
+}
+
+/*
+ * gfs_readdir_emit_int: internal routine to emit directory entry
+ *
+ * st - the current readdir state, which must have d_ino/ed_ino
+ * and d_name/ed_name set
+ * uiop - caller-supplied uio pointer
+ * next - the offset of the next entry
+ */
+static int
+gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
+{
+ int reclen;
+ dirent64_t *dp;
+ edirent_t *edp;
+
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp = st->grd_dirent;
+ reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
+ } else {
+ dp = st->grd_dirent;
+ reclen = DIRENT64_RECLEN(strlen(dp->d_name));
+ }
+
+ if (reclen > uiop->uio_resid) {
+ /*
+ * Error if no entries were returned yet
+ */
+ if (uiop->uio_resid == st->grd_oresid)
+ return (EINVAL);
+ return (-1);
+ }
+
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edp->ed_off = next;
+ edp->ed_reclen = (ushort_t)reclen;
+ } else {
+ dp->d_off = next;
+ dp->d_reclen = (ushort_t)reclen;
+ }
+
+ if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
+ return (EFAULT);
+
+ uiop->uio_loffset = next;
+
+ return (0);
+}
+
+/*
+ * gfs_readdir_emit: emit a directory entry
+ * voff - the virtual offset (obtained from gfs_readdir_pred)
+ * ino - the entry's inode
+ * name - the entry's name
+ * eflags - value for ed_eflags (if processing edirent_t)
+ *
+ * Returns a 0 on success, a non-zero errno on failure, or -1 if the
+ * readdir loop should terminate. A non-zero result (either errno or
+ * -1) from this function is typically passed directly to
+ * gfs_readdir_fini().
+ */
+int
+gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
+ ino64_t ino, const char *name, int eflags)
+{
+ offset_t off = (voff + 2) * st->grd_ureclen;
+
+ if (st->grd_flags & V_RDDIR_ENTFLAGS) {
+ edirent_t *edp = st->grd_dirent;
+
+ edp->ed_ino = ino;
+ (void) strncpy(edp->ed_name, name, st->grd_namlen);
+ edp->ed_eflags = eflags;
+ } else {
+ dirent64_t *dp = st->grd_dirent;
+
+ dp->d_ino = ino;
+ (void) strncpy(dp->d_name, name, st->grd_namlen);
+ }
+
+ /*
+ * Inter-entry offsets are invalid, so we assume a record size of
+ * grd_ureclen and explicitly set the offset appropriately.
+ */
+ return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
+}
+
+/*
+ * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
+ * instead of a string for the entry's name.
+ */
+int
+gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
+ ino64_t ino, unsigned long num)
+{
+ char buf[40];
+
+ numtos(num, buf);
+ return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
+}
+
+/*
+ * gfs_readdir_pred: readdir loop predicate
+ * voffp - a pointer in which the next virtual offset should be stored
+ *
+ * Returns a 0 on success, a non-zero errno on failure, or -1 if the
+ * readdir loop should terminate. A non-zero result (either errno or
+ * -1) from this function is typically passed directly to
+ * gfs_readdir_fini().
+ */
+int
+gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
+{
+ offset_t off, voff;
+ int error;
+
+top:
+ if (uiop->uio_resid <= 0)
+ return (-1);
+
+ off = uiop->uio_loffset / st->grd_ureclen;
+ voff = off - 2;
+ if (off == 0) {
+ if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
+ ".", 0)) == 0)
+ goto top;
+ } else if (off == 1) {
+ if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
+ "..", 0)) == 0)
+ goto top;
+ } else {
+ *voffp = voff;
+ return (0);
+ }
+
+ return (error);
+}
+
+/*
+ * gfs_readdir_fini: generic readdir cleanup
+ * error - if positive, an error to return
+ * eofp - the eofp passed to readdir
+ * eof - the eof value
+ *
+ * Returns a 0 on success, a non-zero errno on failure. This result
+ * should be returned from readdir.
+ */
+int
+gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
+{
+ size_t dirent_size;
+
+ if (st->grd_flags & V_RDDIR_ENTFLAGS)
+ dirent_size = EDIRENT_RECLEN(st->grd_namlen);
+ else
+ dirent_size = DIRENT64_RECLEN(st->grd_namlen);
+ kmem_free(st->grd_dirent, dirent_size);
+ if (error > 0)
+ return (error);
+ if (eofp)
+ *eofp = eof;
+ return (0);
+}
+
+/*
+ * gfs_lookup_dot
+ *
+ * Performs a basic check for "." and ".." directory entries.
+ */
+int
+gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
+{
+ if (*nm == '\0' || strcmp(nm, ".") == 0) {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ } else if (strcmp(nm, "..") == 0) {
+ if (pvp == NULL) {
+ ASSERT(dvp->v_flag & VROOT);
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ } else {
+ VN_HOLD(pvp);
+ *vpp = pvp;
+ }
+ return (0);
+ }
+
+ return (-1);
+}
+
+/*
+ * gfs_file_create(): create a new GFS file
+ *
+ * size - size of private data structure (v_data)
+ * pvp - parent vnode (GFS directory)
+ * ops - vnode operations vector
+ *
+ * In order to use this interface, the parent vnode must have been created by
+ * gfs_dir_create(), and the private data stored in v_data must have a
+ * 'gfs_file_t' as its first field.
+ *
+ * Given these constraints, this routine will automatically:
+ *
+ * - Allocate v_data for the vnode
+ * - Initialize necessary fields in the vnode
+ * - Hold the parent
+ */
+vnode_t *
+gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops)
+{
+ gfs_file_t *fp;
+ vnode_t *vp;
+
+ /*
+ * Allocate vnode and internal data structure
+ */
+ fp = kmem_zalloc(size, KM_SLEEP);
+ vp = vn_alloc(KM_SLEEP);
+
+ /*
+ * Set up various pointers
+ */
+ fp->gfs_vnode = vp;
+ fp->gfs_parent = pvp;
+ vp->v_data = fp;
+ fp->gfs_size = size;
+ fp->gfs_type = GFS_FILE;
+
+ /*
+ * Initialize vnode and hold parent.
+ */
+ vn_setops(vp, ops);
+ if (pvp) {
+ VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
+ VN_HOLD(pvp);
+ }
+
+ return (vp);
+}
+
+/*
+ * gfs_dir_create: creates a new directory in the parent
+ *
+ * size - size of private data structure (v_data)
+ * pvp - parent vnode (GFS directory)
+ * ops - vnode operations vector
+ * entries - NULL-terminated list of static entries (if any)
+ * maxlen - maximum length of a directory entry
+ * readdir_cb - readdir callback (see gfs_dir_readdir)
+ * inode_cb - inode callback (see gfs_dir_readdir)
+ * lookup_cb - lookup callback (see gfs_dir_lookup)
+ *
+ * In order to use this function, the first member of the private vnode
+ * structure (v_data) must be a gfs_dir_t. For each directory, there are
+ * static entries, defined when the structure is initialized, and dynamic
+ * entries, retrieved through callbacks.
+ *
+ * If a directory has static entries, then it must supply an inode callback,
+ * which will compute the inode number based on the parent and the index.
+ * For a directory with dynamic entries, the caller must supply a readdir
+ * callback and a lookup callback. If a static lookup fails, we fall back to
+ * the supplied lookup callback, if any.
+ *
+ * This function also performs the same initialization as gfs_file_create().
+ */
+vnode_t *
+gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops,
+ gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
+ gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
+{
+ vnode_t *vp;
+ gfs_dir_t *dp;
+ gfs_dirent_t *de;
+
+ vp = gfs_file_create(struct_size, pvp, ops);
+ vp->v_type = VDIR;
+
+ dp = vp->v_data;
+ dp->gfsd_file.gfs_type = GFS_DIR;
+ dp->gfsd_maxlen = maxlen;
+
+ if (entries != NULL) {
+ for (de = entries; de->gfse_name != NULL; de++)
+ dp->gfsd_nstatic++;
+
+ dp->gfsd_static = kmem_alloc(
+ dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
+ bcopy(entries, dp->gfsd_static,
+ dp->gfsd_nstatic * sizeof (gfs_dirent_t));
+ }
+
+ dp->gfsd_readdir = readdir_cb;
+ dp->gfsd_lookup = lookup_cb;
+ dp->gfsd_inode = inode_cb;
+
+ mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (vp);
+}
+
+/*
+ * gfs_root_create(): create a root vnode for a GFS filesystem
+ *
+ * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The
+ * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
+ */
+vnode_t *
+gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
+ gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
+ gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
+{
+ vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
+ maxlen, readdir_cb, lookup_cb);
+
+ /* Manually set the inode */
+ ((gfs_file_t *)vp->v_data)->gfs_ino = ino;
+
+ VFS_HOLD(vfsp);
+ VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
+ vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
+
+ return (vp);
+}
+
+/*
+ * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
+ *
+ * Similar to gfs_root_create(), this creates a root vnode for a file to
+ * be the pseudo-filesystem.
+ */
+vnode_t *
+gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
+{
+ vnode_t *vp = gfs_file_create(size, NULL, ops);
+
+ ((gfs_file_t *)vp->v_data)->gfs_ino = ino;
+
+ VFS_HOLD(vfsp);
+ VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
+ vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
+
+ return (vp);
+}
+
+/*
+ * gfs_file_inactive()
+ *
+ * Called from the VOP_INACTIVE() routine. If necessary, this routine will
+ * remove the given vnode from the parent directory and clean up any references
+ * in the VFS layer.
+ *
+ * If the vnode was not removed (due to a race with vget), then NULL is
+ * returned. Otherwise, a pointer to the private data is returned.
+ */
+void *
+gfs_file_inactive(vnode_t *vp)
+{
+ int i;
+ gfs_dirent_t *ge = NULL;
+ gfs_file_t *fp = vp->v_data;
+ gfs_dir_t *dp = NULL;
+ void *data;
+
+ if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
+ goto found;
+
+ dp = fp->gfs_parent->v_data;
+
+ /*
+ * First, see if this vnode is cached in the parent.
+ */
+ gfs_dir_lock(dp);
+
+ /*
+ * Find it in the set of static entries.
+ */
+ for (i = 0; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (ge->gfse_vnode == vp)
+ goto found;
+ }
+
+ /*
+ * If 'ge' is NULL, then it is a dynamic entry.
+ */
+ ge = NULL;
+
+found:
+ if (vp->v_flag & V_XATTRDIR) {
+ mutex_enter(&fp->gfs_parent->v_lock);
+ }
+ mutex_enter(&vp->v_lock);
+ if (vp->v_count == 1) {
+ /*
+ * Really remove this vnode
+ */
+ data = vp->v_data;
+ if (ge != NULL) {
+ /*
+ * If this was a statically cached entry, simply set the
+ * cached vnode to NULL.
+ */
+ ge->gfse_vnode = NULL;
+ }
+ if (vp->v_flag & V_XATTRDIR) {
+ fp->gfs_parent->v_xattrdir = NULL;
+ mutex_exit(&fp->gfs_parent->v_lock);
+ }
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * Free vnode and release parent
+ */
+ if (fp->gfs_parent) {
+ if (dp) {
+ gfs_dir_unlock(dp);
+ }
+ VN_RELE(fp->gfs_parent);
+ } else {
+ ASSERT(vp->v_vfsp != NULL);
+ VFS_RELE(vp->v_vfsp);
+ }
+ vn_free(vp);
+ } else {
+ vp->v_count--;
+ data = NULL;
+ mutex_exit(&vp->v_lock);
+ if (vp->v_flag & V_XATTRDIR) {
+ mutex_exit(&fp->gfs_parent->v_lock);
+ }
+ if (dp)
+ gfs_dir_unlock(dp);
+ }
+
+ return (data);
+}
+
+/*
+ * gfs_dir_inactive()
+ *
+ * Same as above, but for directories.
+ */
+void *
+gfs_dir_inactive(vnode_t *vp)
+{
+ gfs_dir_t *dp;
+
+ ASSERT(vp->v_type == VDIR);
+
+ if ((dp = gfs_file_inactive(vp)) != NULL) {
+ mutex_destroy(&dp->gfsd_lock);
+ if (dp->gfsd_nstatic)
+ kmem_free(dp->gfsd_static,
+ dp->gfsd_nstatic * sizeof (gfs_dirent_t));
+ }
+
+ return (dp);
+}
+
+/*
+ * gfs_dir_lookup_dynamic()
+ *
+ * This routine looks up the provided name amongst the dynamic entries
+ * in the gfs directory and returns the corresponding vnode, if found.
+ *
+ * The gfs directory is expected to be locked by the caller prior to
+ * calling this function. The directory will be unlocked during the
+ * execution of this function, but will be locked upon return from the
+ * function. This function returns 0 on success, non-zero on error.
+ *
+ * The dynamic lookups are performed by invoking the lookup
+ * callback, which is passed to this function as the first argument.
+ * The arguments to the callback are:
+ *
+ * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr,
+ * int flags, int *deflgs, pathname_t *rpnp);
+ *
+ * pvp - parent vnode
+ * nm - name of entry
+ * vpp - pointer to resulting vnode
+ * cr - pointer to cred
+ * flags - flags value from lookup request
+ * ignored here; currently only used to request
+ * insensitive lookups
+ * direntflgs - output parameter, directory entry flags
+ * ignored here; currently only used to indicate a lookup
+ * has more than one possible match when case is not considered
+ * realpnp - output parameter, real pathname
+ * ignored here; when lookup was performed case-insensitively,
+ * this field contains the "real" name of the file.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int
+gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
+ const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
+ int *direntflags, pathname_t *realpnp)
+{
+ gfs_file_t *fp;
+ ino64_t ino;
+ int ret;
+
+ ASSERT(GFS_DIR_LOCKED(dp));
+
+ /*
+ * Drop the directory lock, as the lookup routine
+ * will need to allocate memory, or otherwise deadlock on this
+ * directory.
+ */
+ gfs_dir_unlock(dp);
+ ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
+ gfs_dir_lock(dp);
+
+ /*
+ * The callback for extended attributes returns a vnode
+ * with v_data from an underlying fs.
+ */
+ if (ret == 0 && !IS_XATTRDIR(dvp)) {
+ fp = (gfs_file_t *)((*vpp)->v_data);
+ fp->gfs_index = -1;
+ fp->gfs_ino = ino;
+ }
+
+ return (ret);
+}
+
+/*
+ * gfs_dir_lookup_static()
+ *
+ * This routine looks up the provided name amongst the static entries
+ * in the gfs directory and returns the corresponding vnode, if found.
+ * The first argument to the function is a pointer to the comparison
+ * function this function should use to decide if names are a match.
+ *
+ * If a match is found, and GFS_CACHE_VNODE is set and the vnode
+ * exists, we simply return the existing vnode. Otherwise, we call
+ * the static entry's callback routine, caching the result if
+ * necessary. If the idx pointer argument is non-NULL, we use it to
+ * return the index of the matching static entry.
+ *
+ * The gfs directory is expected to be locked by the caller prior to calling
+ * this function. The directory may be unlocked during the execution of
+ * this function, but will be locked upon return from the function.
+ *
+ * This function returns 0 if a match is found, ENOENT if not.
+ */
+static int
+gfs_dir_lookup_static(int (*compare)(const char *, const char *),
+ gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
+ vnode_t **vpp, pathname_t *rpnp)
+{
+ gfs_dirent_t *ge;
+ vnode_t *vp = NULL;
+ int i;
+
+ ASSERT(GFS_DIR_LOCKED(dp));
+
+ /*
+ * Search static entries.
+ */
+ for (i = 0; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (compare(ge->gfse_name, nm) == 0) {
+ if (rpnp)
+ (void) strlcpy(rpnp->pn_buf, ge->gfse_name,
+ rpnp->pn_bufsize);
+
+ if (ge->gfse_vnode) {
+ ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
+ vp = ge->gfse_vnode;
+ VN_HOLD(vp);
+ break;
+ }
+
+ /*
+ * We drop the directory lock, as the constructor will
+ * need to do KM_SLEEP allocations. If we return from
+ * the constructor only to find that a parallel
+ * operation has completed, and GFS_CACHE_VNODE is set
+ * for this entry, we discard the result in favor of
+ * the cached vnode.
+ */
+ gfs_dir_unlock(dp);
+ vp = ge->gfse_ctor(dvp);
+ gfs_dir_lock(dp);
+
+ ((gfs_file_t *)vp->v_data)->gfs_index = i;
+
+ /* Set the inode according to the callback. */
+ ((gfs_file_t *)vp->v_data)->gfs_ino =
+ dp->gfsd_inode(dvp, i);
+
+ if (ge->gfse_flags & GFS_CACHE_VNODE) {
+ if (ge->gfse_vnode == NULL) {
+ ge->gfse_vnode = vp;
+ } else {
+ /*
+ * A parallel constructor beat us to it;
+ * return existing vnode. We have to be
+ * careful because we can't release the
+ * current vnode while holding the
+ * directory lock; its inactive routine
+ * will try to lock this directory.
+ */
+ vnode_t *oldvp = vp;
+ vp = ge->gfse_vnode;
+ VN_HOLD(vp);
+
+ gfs_dir_unlock(dp);
+ VN_RELE(oldvp);
+ gfs_dir_lock(dp);
+ }
+ }
+ break;
+ }
+ }
+
+ if (vp == NULL)
+ return (ENOENT);
+ else if (idx)
+ *idx = i;
+ *vpp = vp;
+ return (0);
+}
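+
+/*
+ * A minimal sketch of a static entry table that takes advantage of the
+ * caching handled above.  The entry name and the constructor
+ * example_info_ctor() are hypothetical; by convention the array is
+ * terminated by an entry with a NULL name.
+ */
+extern vnode_t *example_info_ctor(vnode_t *);	/* hypothetical */
+
+static gfs_dirent_t example_static_entries[] = {
+	{ "info", example_info_ctor, GFS_CACHE_VNODE },
+	{ NULL }
+};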
+
+/*
+ * gfs_dir_lookup()
+ *
+ * Looks up the given name in the directory and returns the corresponding
+ * vnode, if found.
+ *
+ * First, we search statically defined entries, if any, with a call to
+ * gfs_dir_lookup_static(). If no static entry is found, and we have
+ * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
+ *
+ * This function returns 0 on success, non-zero on error.
+ */
+int
+gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
+ int flags, int *direntflags, pathname_t *realpnp)
+{
+ gfs_dir_t *dp = dvp->v_data;
+ boolean_t casecheck;
+ vnode_t *dynvp = NULL;
+ vnode_t *vp = NULL;
+ int (*compare)(const char *, const char *);
+ int error, idx;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
+ return (0);
+
+ casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
+ if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
+ (flags & FIGNORECASE))
+ compare = strcasecmp;
+ else
+ compare = strcmp;
+
+ gfs_dir_lock(dp);
+
+ error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);
+
+ if (vp && casecheck) {
+ gfs_dirent_t *ge;
+ int i;
+
+ for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
+ ge = &dp->gfsd_static[i];
+
+ if (strcasecmp(ge->gfse_name, nm) == 0) {
+ *direntflags |= ED_CASE_CONFLICT;
+ goto out;
+ }
+ }
+ }
+
+ if ((error || casecheck) && dp->gfsd_lookup)
+ error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
+ &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);
+
+ if (vp && dynvp) {
+		/* static and dynamic entries are a case-insensitive conflict */
+ ASSERT(casecheck);
+ *direntflags |= ED_CASE_CONFLICT;
+ VN_RELE(dynvp);
+ } else if (vp == NULL) {
+ vp = dynvp;
+ } else if (error == ENOENT) {
+ error = 0;
+ } else if (error) {
+ VN_RELE(vp);
+ vp = NULL;
+ }
+
+out:
+ gfs_dir_unlock(dp);
+
+ *vpp = vp;
+ return (error);
+}
+
+/*
+ * gfs_dir_readdir: does a readdir() on the given directory
+ *
+ * dvp - directory vnode
+ * uiop - uio structure
+ * eofp - eof pointer
+ * data - arbitrary data passed to readdir callback
+ *
+ * This routine does all the readdir() dirty work. Even so, the caller must
+ * supply two callbacks in order to get full compatibility.
+ *
+ * If the directory contains static entries, an inode callback must be
+ * specified. This avoids having to create every vnode and call VOP_GETATTR()
+ * when reading the directory. This function has the following arguments:
+ *
+ * ino_t gfs_inode_cb(vnode_t *vp, int index);
+ *
+ * vp - vnode for the directory
+ * index - index in original gfs_dirent_t array
+ *
+ * Returns the inode number for the given entry.
+ *
+ * For directories with dynamic entries, a readdir callback must be provided.
+ * This is significantly more complex, thanks to the particulars of
+ * VOP_READDIR().
+ *
+ * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
+ * offset_t *off, offset_t *nextoff, void *data, int flags)
+ *
+ * vp - directory vnode
+ * dp - directory entry, sized according to maxlen given to
+ * gfs_dir_create(). The callback must fill in d_name and
+ * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
+ * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
+ * is set in 'flags'.
+ * eofp - the callback must set this to 1 when EOF has been reached
+ * off - on entry, the last offset read from the directory. The callback
+ * must set this to the offset of the current entry; it is typically
+ * left untouched.
+ * nextoff - the callback must set this to the offset of the next entry,
+ * typically (off + 1)
+ * data - caller-supplied data
+ * flags - VOP_READDIR flags
+ *
+ * Return 0 on success, or error on failure.
+ */
+int
+gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ gfs_readdir_state_t gstate;
+ int error, eof = 0;
+ ino64_t ino, pino;
+ offset_t off, next;
+ gfs_dir_t *dp = dvp->v_data;
+
+ error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
+ if (error)
+ return (error);
+
+ if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
+ pino, ino, flags)) != 0)
+ return (error);
+
+ while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
+ !eof) {
+
+ if (off >= 0 && off < dp->gfsd_nstatic) {
+ ino = dp->gfsd_inode(dvp, off);
+
+ if ((error = gfs_readdir_emit(&gstate, uiop,
+ off, ino, dp->gfsd_static[off].gfse_name, 0))
+ != 0)
+ break;
+
+ } else if (dp->gfsd_readdir) {
+ off -= dp->gfsd_nstatic;
+
+ if ((error = dp->gfsd_readdir(dvp,
+ gstate.grd_dirent, &eof, &off, &next,
+ data, flags)) != 0 || eof)
+ break;
+
+ off += dp->gfsd_nstatic + 2;
+ next += dp->gfsd_nstatic + 2;
+
+ if ((error = gfs_readdir_emit_int(&gstate, uiop,
+ next)) != 0)
+ break;
+ } else {
+ /*
+ * Offset is beyond the end of the static entries, and
+ * we have no dynamic entries. Set EOF.
+ */
+ eof = 1;
+ }
+ }
+
+ return (gfs_readdir_fini(&gstate, error, eofp, eof));
+}
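+
+/*
+ * A minimal sketch of a readdir callback following the contract described
+ * above, for a directory whose dynamic entries are simply numbered.  The
+ * entry count and inode numbering are hypothetical, the sketch ignores
+ * V_RDDIR_ENTFLAGS (it assumes a dirent64_t buffer), and it assumes the
+ * directory was created with a maxlen of at least MAXNAMELEN.
+ */
+/* ARGSUSED */
+static int
+example_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp,
+    offset_t *nextp, void *data, int flags)
+{
+	dirent64_t *dep = dp;
+
+	if (*offp >= 16) {		/* hypothetical number of entries */
+		*eofp = 1;
+		return (0);
+	}
+
+	(void) snprintf(dep->d_name, MAXNAMELEN, "%lld", (longlong_t)*offp);
+	dep->d_ino = (ino64_t)(*offp + 1);	/* hypothetical numbering */
+	*nextp = *offp + 1;		/* next entry follows directly */
+	return (0);
+}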
+
+
+/*
+ * gfs_vop_lookup: VOP_LOOKUP() entry point
+ *
+ * For use directly in vnode ops table. Given a GFS directory, calls
+ * gfs_dir_lookup() as necessary.
+ */
+/* ARGSUSED */
+int
+gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
+}
+
+/*
+ * gfs_vop_readdir: VOP_READDIR() entry point
+ *
+ * For use directly in vnode ops table. Given a GFS directory, calls
+ * gfs_dir_readdir() as necessary.
+ */
+/* ARGSUSED */
+int
+gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
+}
+
+
+/*
+ * gfs_vop_map: VOP_MAP() entry point
+ *
+ * Convenience routine for handling pseudo-files that wish to allow mmap() calls.
+ * This function only works for read-only files, and uses the read function for
+ * the vnode to fill in the data. The mapped data is immediately faulted in and
+ * filled with the necessary data during this call; there are no getpage() or
+ * putpage() routines.
+ */
+/* ARGSUSED */
+int
+gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
+ caller_context_t *ct)
+{
+ int rv;
+ ssize_t resid = len;
+
+ /*
+ * Check for bad parameters
+ */
+#ifdef _ILP32
+ if (len > MAXOFF_T)
+ return (ENOMEM);
+#endif
+ if (vp->v_flag & VNOMAP)
+ return (ENOTSUP);
+ if (off > MAXOFF_T)
+ return (EFBIG);
+ if ((long)off < 0 || (long)(off + len) < 0)
+ return (EINVAL);
+ if (vp->v_type != VREG)
+ return (ENODEV);
+ if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
+ return (EACCES);
+
+ /*
+ * Find appropriate address if needed, otherwise clear address range.
+ */
+ as_rangelock(as);
+ rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+ if (rv != 0) {
+ as_rangeunlock(as);
+ return (rv);
+ }
+
+ /*
+ * Create mapping
+ */
+ rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
+ as_rangeunlock(as);
+ if (rv != 0)
+ return (rv);
+
+ /*
+ * Fill with data from read()
+ */
+ rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
+ 0, (rlim64_t)0, cred, &resid);
+
+ if (rv == 0 && resid != 0)
+ rv = ENXIO;
+
+ if (rv != 0) {
+ as_rangelock(as);
+ (void) as_unmap(as, *addrp, len);
+ as_rangeunlock(as);
+ }
+
+ return (rv);
+}
+
+/*
+ * gfs_vop_inactive: VOP_INACTIVE() entry point
+ *
+ * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
+ * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
+ */
+/* ARGSUSED */
+void
+gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ gfs_file_t *fp = vp->v_data;
+ void *data;
+
+ if (fp->gfs_type == GFS_DIR)
+ data = gfs_dir_inactive(vp);
+ else
+ data = gfs_file_inactive(vp);
+
+ if (data != NULL)
+ kmem_free(data, fp->gfs_size);
+}
diff --git a/uts/common/fs/vnode.c b/uts/common/fs/vnode.c
new file mode 100644
index 000000000000..382369c7fc72
--- /dev/null
+++ b/uts/common/fs/vnode.c
@@ -0,0 +1,4536 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/rwstlock.h>
+#include <sys/fem.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/conf.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/systm.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <c2/audit.h>
+#include <sys/acl.h>
+#include <sys/nbmlock.h>
+#include <sys/fcntl.h>
+#include <fs/fs_subr.h>
+#include <sys/taskq.h>
+#include <fs/fs_reparse.h>
+
+/* Determine if this vnode is a file that is read-only */
+#define ISROFILE(vp) \
+ ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
+ (vp)->v_type != VFIFO && vn_is_readonly(vp))
+
+/* Tunable via /etc/system; used only by admin/install */
+int nfs_global_client_only;
+
+/*
+ * Array of vopstats_t for per-FS-type vopstats. This array has the same
+ * number of entries as the vfssw table and parallels it. (Arguably, it could
+ * be part of the vfssw table.) Once it's initialized, it's accessed using
+ * the same fstype index that is used to index into the vfssw table.
+ */
+vopstats_t **vopstats_fstype;
+
+/* vopstats initialization template used for fast initialization via bcopy() */
+static vopstats_t *vs_templatep;
+
+/* Kmem cache handle for vsk_anchor_t allocations */
+kmem_cache_t *vsk_anchor_cache;
+
+/* file events cleanup routine */
+extern void free_fopdata(vnode_t *);
+
+/*
+ * Root of AVL tree for the kstats associated with vopstats. Lock protects
+ * updates to vskstat_tree.
+ */
+avl_tree_t vskstat_tree;
+kmutex_t vskstat_tree_lock;
+
+/* Global variable which enables/disables the vopstats collection */
+int vopstats_enabled = 1;
+
+/*
+ * forward declarations for internal vnode specific data (vsd)
+ */
+static void *vsd_realloc(void *, size_t, size_t);
+
+/*
+ * forward declarations for reparse point functions
+ */
+static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
+
+/*
+ * VSD -- VNODE SPECIFIC DATA
+ * The v_data pointer is typically used by a file system to store a
+ * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
+ * However, there are times when additional project private data needs
+ * to be stored separately from the data (node) pointed to by v_data.
+ * This additional data could be stored by the file system itself or
+ * by a completely different kernel entity. VSD provides a way for
+ * callers to obtain a key and store a pointer to private data associated
+ * with a vnode.
+ *
+ * Callers are responsible for protecting the vsd by holding v_vsd_lock
+ * for calls to vsd_set() and vsd_get().
+ */
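+
+/*
+ * A minimal sketch of the VSD calling pattern described above.  The key
+ * variable and the init/lookup helpers are hypothetical, no destructor
+ * is registered, and the locking follows the v_vsd_lock rule stated in
+ * the comment.
+ */
+static uint_t example_vsd_key;
+
+static void
+example_vsd_init(void)
+{
+	/* Called once (e.g. at module load) to allocate the key. */
+	vsd_create(&example_vsd_key, NULL);
+}
+
+static void *
+example_vsd_lookup(vnode_t *vp)
+{
+	void *data;
+
+	mutex_enter(&vp->v_vsd_lock);
+	data = vsd_get(vp, example_vsd_key);
+	mutex_exit(&vp->v_vsd_lock);
+
+	return (data);
+}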
+
+/*
+ * vsd_lock protects:
+ * vsd_nkeys - creation and deletion of vsd keys
+ * vsd_list - insertion and deletion of vsd_node in the vsd_list
+ * vsd_destructor - adding and removing destructors to the list
+ */
+static kmutex_t vsd_lock;
+static uint_t vsd_nkeys; /* size of destructor array */
+/* list of vsd_node's */
+static list_t *vsd_list = NULL;
+/* per-key destructor funcs */
+static void (**vsd_destructor)(void *);
+
+/*
+ * The following is the common set of actions needed to update the
+ * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and
+ * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
+ * recording of the bytes transferred. Since the code is similar
+ * but small, it is nearly a duplicate. Consequently any changes
+ * to one may need to be reflected in the other.
+ * Rundown of the variables:
+ * vp - Pointer to the vnode
+ * counter - Partial name structure member to update in vopstats for counts
+ * bytecounter - Partial name structure member to update in vopstats for bytes
+ * bytesval - Value to update in vopstats for bytes
+ * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
+ * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
+ */
+
+#define VOPSTATS_UPDATE(vp, counter) { \
+ vfs_t *vfsp = (vp)->v_vfsp; \
+ if (vfsp && vfsp->vfs_implp && \
+ (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
+ vopstats_t *vsp = &vfsp->vfs_vopstats; \
+ uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
+ extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
+ size_t, uint64_t *); \
+ __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \
+ (*stataddr)++; \
+ if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
+ vsp->n##counter.value.ui64++; \
+ } \
+ } \
+}
+
+#define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \
+ vfs_t *vfsp = (vp)->v_vfsp; \
+ if (vfsp && vfsp->vfs_implp && \
+ (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
+ vopstats_t *vsp = &vfsp->vfs_vopstats; \
+ uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
+ extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
+ size_t, uint64_t *); \
+ __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
+ (*stataddr)++; \
+ vsp->bytecounter.value.ui64 += bytesval; \
+ if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
+ vsp->n##counter.value.ui64++; \
+ vsp->bytecounter.value.ui64 += bytesval; \
+ } \
+ } \
+}
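+
+/*
+ * A sketch of roughly how the fop_*() wrappers later in this file use
+ * these macros: dispatch through the vnodeops vector, then record the
+ * count (and, for I/O operations, the bytes actually transferred).  The
+ * wrapper name and resid bookkeeping here are illustrative only.
+ */
+static int
+example_read_wrapper(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	ssize_t resid_start = uiop->uio_resid;
+	int err;
+
+	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
+	VOPSTATS_UPDATE_IO(vp, read,
+	    read_bytes, (resid_start - uiop->uio_resid));
+	return (err);
+}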
+
+/*
+ * If the filesystem does not support XIDs, map the credential.
+ * If the vfsp is NULL, perhaps we should also map?
+ */
+#define VOPXID_MAP_CR(vp, cr) { \
+ vfs_t *vfsp = (vp)->v_vfsp; \
+ if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \
+ cr = crgetmapped(cr); \
+ }
+
+/*
+ * Convert stat(2) formats to vnode types and vice versa. (Knows about
+ * numerical order of S_IFMT and vnode types.)
+ */
+enum vtype iftovt_tab[] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
+};
+
+ushort_t vttoif_tab[] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
+ S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
+};
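+
+/*
+ * A sketch of how the tables are indexed; the standard IFTOVT()/VTTOIF()
+ * macros in the vnode headers do essentially this.
+ */
+static enum vtype
+example_mode_to_vtype(mode_t mode)
+{
+	return (iftovt_tab[(mode & S_IFMT) >> 12]);
+}
+
+static mode_t
+example_vtype_to_mode(enum vtype type)
+{
+	return (vttoif_tab[(int)type]);
+}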
+
+/*
+ * The system vnode cache.
+ */
+
+kmem_cache_t *vn_cache;
+
+
+/*
+ * Vnode operations vector.
+ */
+
+static const fs_operation_trans_def_t vn_ops_table[] = {
+ VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_READ, offsetof(struct vnodeops, vop_read),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
+ fs_setfl, fs_nosys,
+
+ VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
+ fs_rwlock, fs_rwlock,
+
+ VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
+ (fs_generic_func_p) fs_rwunlock,
+ (fs_generic_func_p) fs_rwunlock, /* no errors allowed */
+
+ VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
+ fs_cmp, fs_cmp, /* no errors allowed */
+
+ VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
+ fs_frlock, fs_nosys,
+
+ VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
+ (fs_generic_func_p) fs_nosys_map,
+ (fs_generic_func_p) fs_nosys_map,
+
+ VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
+ (fs_generic_func_p) fs_nosys_addmap,
+ (fs_generic_func_p) fs_nosys_addmap,
+
+ VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
+ (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
+
+ VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
+ fs_pathconf, fs_nosys,
+
+ VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
+ (fs_generic_func_p) fs_dispose,
+ (fs_generic_func_p) fs_nodispose,
+
+ VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
+ fs_fab_acl, fs_nosys,
+
+ VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
+ fs_shrlock, fs_nosys,
+
+ VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
+ (fs_generic_func_p) fs_vnevent_nosupport,
+ (fs_generic_func_p) fs_vnevent_nosupport,
+
+ VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
+ fs_nosys, fs_nosys,
+
+ NULL, 0, NULL, NULL
+};
+
+/* Extensible attribute (xva) routines. */
+
+/*
+ * Zero out the structure, set the size of the requested/returned bitmaps,
+ * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
+ * to the returned attributes array.
+ */
+void
+xva_init(xvattr_t *xvap)
+{
+ bzero(xvap, sizeof (xvattr_t));
+ xvap->xva_mapsize = XVA_MAPSIZE;
+ xvap->xva_magic = XVA_MAGIC;
+ xvap->xva_vattr.va_mask = AT_XVATTR;
+ xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
+}
+
+/*
+ * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
+ * structure. Otherwise, returns NULL.
+ */
+xoptattr_t *
+xva_getxoptattr(xvattr_t *xvap)
+{
+ xoptattr_t *xoap = NULL;
+ if (xvap->xva_vattr.va_mask & AT_XVATTR)
+ xoap = &xvap->xva_xoptattrs;
+ return (xoap);
+}
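+
+/*
+ * A sketch of the usual extensible-attribute calling pattern, assuming
+ * the XVA_SET_REQ()/XVA_ISSET_RTN() macros and the XAT_READONLY attribute
+ * from the vnode headers; the helper name and result handling are
+ * illustrative.
+ */
+static int
+example_get_readonly(vnode_t *vp, cred_t *cr, boolean_t *readonly)
+{
+	xvattr_t xva;
+	xoptattr_t *xoap;
+	int error;
+
+	xva_init(&xva);
+	XVA_SET_REQ(&xva, XAT_READONLY);
+
+	error = VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL);
+	if (error != 0)
+		return (error);
+
+	*readonly = B_FALSE;
+	if ((xoap = xva_getxoptattr(&xva)) != NULL &&
+	    XVA_ISSET_RTN(&xva, XAT_READONLY))
+		*readonly = (xoap->xoa_readonly != 0);
+
+	return (0);
+}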
+
+/*
+ * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
+ * We use the f_fsid reported by VFS_STATVFS() since we use that for the
+ * kstat name.
+ */
+static int
+vska_compar(const void *n1, const void *n2)
+{
+ int ret;
+ ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
+ ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
+
+ if (p1 < p2) {
+ ret = -1;
+ } else if (p1 > p2) {
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * Used to create a single template which will be bcopy()ed to a newly
+ * allocated vopstats structure in initialize_vopstats(), below.
+ */
+static vopstats_t *
+create_vopstats_template()
+{
+ vopstats_t *vsp;
+
+ vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
+ bzero(vsp, sizeof (*vsp)); /* Start fresh */
+
+ /* VOP_OPEN */
+ kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
+ /* VOP_CLOSE */
+ kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
+ /* VOP_READ I/O */
+ kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
+ /* VOP_WRITE I/O */
+ kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
+ /* VOP_IOCTL */
+ kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
+ /* VOP_SETFL */
+ kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
+ /* VOP_GETATTR */
+ kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
+ /* VOP_SETATTR */
+ kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
+ /* VOP_ACCESS */
+ kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
+ /* VOP_LOOKUP */
+ kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
+ /* VOP_CREATE */
+ kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
+ /* VOP_REMOVE */
+ kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
+ /* VOP_LINK */
+ kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
+ /* VOP_RENAME */
+ kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
+ /* VOP_MKDIR */
+ kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
+ /* VOP_RMDIR */
+ kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
+ /* VOP_READDIR I/O */
+ kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
+ KSTAT_DATA_UINT64);
+ /* VOP_SYMLINK */
+ kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
+ /* VOP_READLINK */
+ kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
+ /* VOP_FSYNC */
+ kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
+ /* VOP_INACTIVE */
+ kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
+ /* VOP_FID */
+ kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
+ /* VOP_RWLOCK */
+ kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
+ /* VOP_RWUNLOCK */
+ kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
+ /* VOP_SEEK */
+ kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
+ /* VOP_CMP */
+ kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
+ /* VOP_FRLOCK */
+ kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
+ /* VOP_SPACE */
+ kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
+ /* VOP_REALVP */
+ kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
+ /* VOP_GETPAGE */
+ kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
+ /* VOP_PUTPAGE */
+ kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
+ /* VOP_MAP */
+ kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
+ /* VOP_ADDMAP */
+ kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
+ /* VOP_DELMAP */
+ kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
+ /* VOP_POLL */
+ kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
+ /* VOP_DUMP */
+ kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
+ /* VOP_PATHCONF */
+ kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
+ /* VOP_PAGEIO */
+ kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
+ /* VOP_DUMPCTL */
+ kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
+ /* VOP_DISPOSE */
+ kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
+ /* VOP_SETSECATTR */
+ kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
+ /* VOP_GETSECATTR */
+ kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
+ /* VOP_SHRLOCK */
+ kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
+ /* VOP_VNEVENT */
+ kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
+ /* VOP_REQZCBUF */
+ kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
+ /* VOP_RETZCBUF */
+ kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
+
+ return (vsp);
+}
+
+/*
+ * Creates a kstat structure associated with a vopstats structure.
+ */
+kstat_t *
+new_vskstat(char *ksname, vopstats_t *vsp)
+{
+ kstat_t *ksp;
+
+ if (!vopstats_enabled) {
+ return (NULL);
+ }
+
+ ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
+ sizeof (vopstats_t)/sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
+ if (ksp) {
+ ksp->ks_data = vsp;
+ kstat_install(ksp);
+ }
+
+ return (ksp);
+}
+
+/*
+ * Called from vfsinit() to initialize the support mechanisms for vopstats
+ */
+void
+vopstats_startup()
+{
+ if (!vopstats_enabled)
+ return;
+
+ /*
+ * Creates the AVL tree which holds per-vfs vopstat anchors. This
+ * is necessary since we need to check if a kstat exists before we
+ * attempt to create it. Also, initialize its lock.
+ */
+ avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
+ offsetof(vsk_anchor_t, vsk_node));
+ mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
+ sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
+ NULL, NULL, 0);
+
+ /*
+ * Set up the array of pointers for the vopstats-by-FS-type.
+ * The entries will be allocated/initialized as each file system
+ * goes through modload/mod_installfs.
+ */
+ vopstats_fstype = (vopstats_t **)kmem_zalloc(
+ (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
+
+ /* Set up the global vopstats initialization template */
+ vs_templatep = create_vopstats_template();
+}
+
+/*
+ * We need to have all of the counters zeroed.
+ * The initialization of the vopstats_t includes on the order of
+ * 50 calls to kstat_named_init(). Rather than do that on every call,
+ * we do it once in a template (vs_templatep) and then bcopy it over.
+ */
+void
+initialize_vopstats(vopstats_t *vsp)
+{
+ if (vsp == NULL)
+ return;
+
+ bcopy(vs_templatep, vsp, sizeof (vopstats_t));
+}
+
+/*
+ * If possible, determine which vopstats by fstype to use and
+ * return a pointer to the caller.
+ */
+vopstats_t *
+get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
+{
+ int fstype = 0; /* Index into vfssw[] */
+ vopstats_t *vsp = NULL;
+
+ if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
+ !vopstats_enabled)
+ return (NULL);
+ /*
+ * Set up the fstype. We go to so much trouble because all versions
+ * of NFS use the same fstype in their vfs even though they have
+ * distinct entries in the vfssw[] table.
+ * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
+ */
+ if (vswp) {
+ fstype = vswp - vfssw; /* Gets us the index */
+ } else {
+ fstype = vfsp->vfs_fstype;
+ }
+
+ /*
+ * Point to the per-fstype vopstats. The only valid values are
+ * non-zero positive values less than the number of vfssw[] table
+ * entries.
+ */
+ if (fstype > 0 && fstype < nfstype) {
+ vsp = vopstats_fstype[fstype];
+ }
+
+ return (vsp);
+}
+
+/*
+ * Generate a kstat name, create the kstat structure, and allocate a
+ * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
+ * to the caller. This must only be called from a mount.
+ */
+vsk_anchor_t *
+get_vskstat_anchor(vfs_t *vfsp)
+{
+ char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
+ statvfs64_t statvfsbuf; /* Needed to find f_fsid */
+ vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
+ kstat_t *ksp; /* Ptr to new kstat */
+ avl_index_t where; /* Location in the AVL tree */
+
+ if (vfsp == NULL || vfsp->vfs_implp == NULL ||
+ (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
+ return (NULL);
+
+ /* Need to get the fsid to build a kstat name */
+ if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
+ /* Create a name for our kstats based on fsid */
+ (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
+ VOPSTATS_STR, statvfsbuf.f_fsid);
+
+ /* Allocate and initialize the vsk_anchor_t */
+ vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
+ bzero(vskp, sizeof (*vskp));
+ vskp->vsk_fsid = statvfsbuf.f_fsid;
+
+ mutex_enter(&vskstat_tree_lock);
+ if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
+ avl_insert(&vskstat_tree, vskp, where);
+ mutex_exit(&vskstat_tree_lock);
+
+ /*
+ * Now that we've got the anchor in the AVL
+ * tree, we can create the kstat.
+ */
+ ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
+ if (ksp) {
+ vskp->vsk_ksp = ksp;
+ }
+ } else {
+ /* Oops, found one! Release memory and lock. */
+ mutex_exit(&vskstat_tree_lock);
+ kmem_cache_free(vsk_anchor_cache, vskp);
+ vskp = NULL;
+ }
+ }
+ return (vskp);
+}
+
+/*
+ * We're in the process of tearing down the vfs and need to cleanup
+ * the data structures associated with the vopstats. Must only be called
+ * from dounmount().
+ */
+void
+teardown_vopstats(vfs_t *vfsp)
+{
+ vsk_anchor_t *vskap;
+ avl_index_t where;
+
+ if (vfsp == NULL || vfsp->vfs_implp == NULL ||
+ (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
+ return;
+
+ /* This is a safe check since VFS_STATS must be set (see above) */
+ if ((vskap = vfsp->vfs_vskap) == NULL)
+ return;
+
+ /* Whack the pointer right away */
+ vfsp->vfs_vskap = NULL;
+
+ /* Lock the tree, remove the node, and delete the kstat */
+ mutex_enter(&vskstat_tree_lock);
+ if (avl_find(&vskstat_tree, vskap, &where)) {
+ avl_remove(&vskstat_tree, vskap);
+ }
+
+ if (vskap->vsk_ksp) {
+ kstat_delete(vskap->vsk_ksp);
+ }
+ mutex_exit(&vskstat_tree_lock);
+
+ kmem_cache_free(vsk_anchor_cache, vskap);
+}
+
+/*
+ * Read or write a vnode. Called from kernel code.
+ */
+int
+vn_rdwr(
+ enum uio_rw rw,
+ struct vnode *vp,
+ caddr_t base,
+ ssize_t len,
+ offset_t offset,
+ enum uio_seg seg,
+ int ioflag,
+ rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */
+ cred_t *cr,
+ ssize_t *residp)
+{
+ struct uio uio;
+ struct iovec iov;
+ int error;
+ int in_crit = 0;
+
+ if (rw == UIO_WRITE && ISROFILE(vp))
+ return (EROFS);
+
+ if (len < 0)
+ return (EIO);
+
+ VOPXID_MAP_CR(vp, cr);
+
+ iov.iov_base = base;
+ iov.iov_len = len;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_loffset = offset;
+ uio.uio_segflg = (short)seg;
+ uio.uio_resid = len;
+ uio.uio_llimit = ulimit;
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, cr, &svmand);
+ if (error != 0)
+ goto done;
+ if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
+ uio.uio_offset, uio.uio_resid, svmand, NULL)) {
+ error = EACCES;
+ goto done;
+ }
+ }
+
+ (void) VOP_RWLOCK(vp,
+ rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
+ if (rw == UIO_WRITE) {
+ uio.uio_fmode = FWRITE;
+ uio.uio_extflg = UIO_COPY_DEFAULT;
+ error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
+ } else {
+ uio.uio_fmode = FREAD;
+ uio.uio_extflg = UIO_COPY_CACHED;
+ error = VOP_READ(vp, &uio, ioflag, cr, NULL);
+ }
+ VOP_RWUNLOCK(vp,
+ rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
+ if (residp)
+ *residp = uio.uio_resid;
+ else if (uio.uio_resid)
+ error = EIO;
+
+done:
+ if (in_crit)
+ nbl_end_crit(vp);
+ return (error);
+}
+
+/*
+ * Release a vnode. Call VOP_INACTIVE on last reference or
+ * decrement reference count.
+ *
+ * To avoid race conditions, the v_count is left at 1 for
+ * the call to VOP_INACTIVE. This prevents another thread
+ * from reclaiming and releasing the vnode *before* the
+ * VOP_INACTIVE routine has a chance to destroy the vnode.
+ * We can't have more than 1 thread calling VOP_INACTIVE
+ * on a vnode.
+ */
+void
+vn_rele(vnode_t *vp)
+{
+ VERIFY(vp->v_count > 0);
+ mutex_enter(&vp->v_lock);
+ if (vp->v_count == 1) {
+ mutex_exit(&vp->v_lock);
+ VOP_INACTIVE(vp, CRED(), NULL);
+ return;
+ }
+ vp->v_count--;
+ mutex_exit(&vp->v_lock);
+}
+
+/*
+ * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
+ * as a single reference, so v_count is not decremented until the last DNLC hold
+ * is released. This makes it possible to distinguish vnodes that are referenced
+ * only by the DNLC.
+ */
+void
+vn_rele_dnlc(vnode_t *vp)
+{
+ VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
+ mutex_enter(&vp->v_lock);
+ if (--vp->v_count_dnlc == 0) {
+ if (vp->v_count == 1) {
+ mutex_exit(&vp->v_lock);
+ VOP_INACTIVE(vp, CRED(), NULL);
+ return;
+ }
+ vp->v_count--;
+ }
+ mutex_exit(&vp->v_lock);
+}
+
+/*
+ * Like vn_rele() except that it clears v_stream under v_lock.
+ * This is used by sockfs when it dismantles the association between
+ * the sockfs node and the vnode in the underlying file system.
+ * v_lock has to be held to prevent a thread coming through the lookupname
+ * path from accessing a stream head that is going away.
+ */
+void
+vn_rele_stream(vnode_t *vp)
+{
+ VERIFY(vp->v_count > 0);
+ mutex_enter(&vp->v_lock);
+ vp->v_stream = NULL;
+ if (vp->v_count == 1) {
+ mutex_exit(&vp->v_lock);
+ VOP_INACTIVE(vp, CRED(), NULL);
+ return;
+ }
+ vp->v_count--;
+ mutex_exit(&vp->v_lock);
+}
+
+static void
+vn_rele_inactive(vnode_t *vp)
+{
+ VOP_INACTIVE(vp, CRED(), NULL);
+}
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode is incremented before the
+ * inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+ VERIFY(vp->v_count > 0);
+ mutex_enter(&vp->v_lock);
+ if (vp->v_count == 1) {
+ mutex_exit(&vp->v_lock);
+ VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
+ vp, TQ_SLEEP) != NULL);
+ return;
+ }
+ vp->v_count--;
+ mutex_exit(&vp->v_lock);
+}
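+
+/*
+ * A sketch of typical vn_rele_async() usage: a long-lived, file-system-
+ * private taskq absorbs final releases so that VOP_INACTIVE() never runs
+ * in the releasing thread's context.  The taskq name, sizing, priority
+ * (minclsyspri) and flags are illustrative.
+ */
+static taskq_t *example_vnrele_taskq;	/* created once, e.g. at mount */
+
+static void
+example_vnrele_setup(void)
+{
+	example_vnrele_taskq = taskq_create("example_vnrele_taskq", 1,
+	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE);
+}
+
+static void
+example_release(vnode_t *vp)
+{
+	vn_rele_async(vp, example_vnrele_taskq);
+}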
+
+int
+vn_open(
+ char *pnamep,
+ enum uio_seg seg,
+ int filemode,
+ int createmode,
+ struct vnode **vpp,
+ enum create crwhy,
+ mode_t umask)
+{
+ return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
+ umask, NULL, -1));
+}
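+
+/*
+ * A sketch of the common kernel pattern that combines vn_open() and
+ * vn_rdwr() to read the start of a file into a caller-supplied buffer.
+ * The helper name is hypothetical and the error handling is minimal;
+ * a real caller would also check the residual count.
+ */
+static int
+example_read_file(char *path, char *buf, ssize_t len, ssize_t *residp)
+{
+	vnode_t *vp;
+	int error;
+
+	error = vn_open(path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0);
+	if (error != 0)
+		return (error);
+
+	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, (offset_t)0,
+	    UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), residp);
+
+	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
+	VN_RELE(vp);
+
+	return (error);
+}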
+
+
+/*
+ * Open/create a vnode.
+ * This may be callable by the kernel, the only known use
+ * of user context being that the current user credentials
+ * are used for permissions. crwhy is defined iff filemode & FCREAT.
+ */
+int
+vn_openat(
+ char *pnamep,
+ enum uio_seg seg,
+ int filemode,
+ int createmode,
+ struct vnode **vpp,
+ enum create crwhy,
+ mode_t umask,
+ struct vnode *startvp,
+ int fd)
+{
+ struct vnode *vp;
+ int mode;
+ int accessflags;
+ int error;
+ int in_crit = 0;
+ int open_done = 0;
+ int shrlock_done = 0;
+ struct vattr vattr;
+ enum symfollow follow;
+ int estale_retry = 0;
+ struct shrlock shr;
+ struct shr_locowner shr_own;
+
+ mode = 0;
+ accessflags = 0;
+ if (filemode & FREAD)
+ mode |= VREAD;
+ if (filemode & (FWRITE|FTRUNC))
+ mode |= VWRITE;
+ if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
+ mode |= VEXEC;
+
+ /* symlink interpretation */
+ if (filemode & FNOFOLLOW)
+ follow = NO_FOLLOW;
+ else
+ follow = FOLLOW;
+
+ if (filemode & FAPPEND)
+ accessflags |= V_APPEND;
+
+top:
+ if (filemode & FCREAT) {
+ enum vcexcl excl;
+
+ /*
+ * Wish to create a file.
+ */
+ vattr.va_type = VREG;
+ vattr.va_mode = createmode;
+ vattr.va_mask = AT_TYPE|AT_MODE;
+ if (filemode & FTRUNC) {
+ vattr.va_size = 0;
+ vattr.va_mask |= AT_SIZE;
+ }
+ if (filemode & FEXCL)
+ excl = EXCL;
+ else
+ excl = NONEXCL;
+
+ if (error =
+ vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
+ (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
+ return (error);
+ } else {
+ /*
+ * Wish to open a file. Just look it up.
+ */
+ if (error = lookupnameat(pnamep, seg, follow,
+ NULLVPP, &vp, startvp)) {
+ if ((error == ESTALE) &&
+ fs_need_estale_retry(estale_retry++))
+ goto top;
+ return (error);
+ }
+
+ /*
+ * Get the attributes to check whether file is large.
+ * We do this only if the FOFFMAX flag is not set and
+ * only for regular files.
+ */
+
+ if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, 0,
+ CRED(), NULL))) {
+ goto out;
+ }
+ if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
+ /*
+ * Large File API - regular open fails
+ * if FOFFMAX flag is set in file mode
+ */
+ error = EOVERFLOW;
+ goto out;
+ }
+ }
+ /*
+ * Can't write directories, active texts, or
+ * read-only filesystems. Can't truncate files
+ * on which mandatory locking is in effect.
+ */
+ if (filemode & (FWRITE|FTRUNC)) {
+ /*
+ * Allow writable directory if VDIROPEN flag is set.
+ */
+ if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
+ error = EISDIR;
+ goto out;
+ }
+ if (ISROFILE(vp)) {
+ error = EROFS;
+ goto out;
+ }
+ /*
+ * Can't truncate files on which
+ * sysv mandatory locking is in effect.
+ */
+ if (filemode & FTRUNC) {
+ vnode_t *rvp;
+
+ if (VOP_REALVP(vp, &rvp, NULL) != 0)
+ rvp = vp;
+ if (rvp->v_filocks != NULL) {
+ vattr.va_mask = AT_MODE;
+ if ((error = VOP_GETATTR(vp,
+ &vattr, 0, CRED(), NULL)) == 0 &&
+ MANDLOCK(vp, vattr.va_mode))
+ error = EAGAIN;
+ }
+ }
+ if (error)
+ goto out;
+ }
+ /*
+ * Check permissions.
+ */
+ if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
+ goto out;
+ /*
+ * Require FSEARCH to return a directory.
+ * Require FEXEC to return a regular file.
+ */
+ if ((filemode & FSEARCH) && vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ if ((filemode & FEXEC) && vp->v_type != VREG) {
+ error = ENOEXEC; /* XXX: error code? */
+ goto out;
+ }
+ }
+
+ /*
+ * Do remaining checks for FNOFOLLOW and FNOLINKS.
+ */
+ if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
+ error = ELOOP;
+ goto out;
+ }
+ if (filemode & FNOLINKS) {
+ vattr.va_mask = AT_NLINK;
+ if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
+ goto out;
+ }
+ if (vattr.va_nlink != 1) {
+ error = EMLINK;
+ goto out;
+ }
+ }
+
+ /*
+ * Opening a socket corresponding to the AF_UNIX pathname
+ * in the filesystem name space is not supported.
+ * However, VSOCK nodes in namefs are supported in order
+ * to make fattach work for sockets.
+ *
+ * XXX This uses VOP_REALVP to distinguish between
+ * an unopened namefs node (where VOP_REALVP returns a
+ * different VSOCK vnode) and a VSOCK created by vn_create
+ * in some file system (where VOP_REALVP would never return
+ * a different vnode).
+ */
+ if (vp->v_type == VSOCK) {
+ struct vnode *nvp;
+
+ error = VOP_REALVP(vp, &nvp, NULL);
+ if (error != 0 || nvp == NULL || nvp == vp ||
+ nvp->v_type != VSOCK) {
+ error = EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ if ((vp->v_type == VREG) && nbl_need_check(vp)) {
+ /* get share reservation */
+ shr.s_access = 0;
+ if (filemode & FWRITE)
+ shr.s_access |= F_WRACC;
+ if (filemode & FREAD)
+ shr.s_access |= F_RDACC;
+ shr.s_deny = 0;
+ shr.s_sysid = 0;
+ shr.s_pid = ttoproc(curthread)->p_pid;
+ shr_own.sl_pid = shr.s_pid;
+ shr_own.sl_id = fd;
+ shr.s_own_len = sizeof (shr_own);
+ shr.s_owner = (caddr_t)&shr_own;
+ error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
+ NULL);
+ if (error)
+ goto out;
+ shrlock_done = 1;
+
+ /* nbmand conflict check if truncating file */
+ if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+
+ vattr.va_mask = AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
+ goto out;
+ if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
+ NULL)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * Do opening protocol.
+ */
+ error = VOP_OPEN(&vp, filemode, CRED(), NULL);
+ if (error)
+ goto out;
+ open_done = 1;
+
+ /*
+ * Truncate if required.
+ */
+ if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
+ vattr.va_size = 0;
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
+ goto out;
+ }
+out:
+ ASSERT(vp->v_count > 0);
+
+ if (in_crit) {
+ nbl_end_crit(vp);
+ in_crit = 0;
+ }
+ if (error) {
+ if (open_done) {
+ (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
+ NULL);
+ open_done = 0;
+ shrlock_done = 0;
+ }
+ if (shrlock_done) {
+ (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
+ NULL);
+ shrlock_done = 0;
+ }
+
+ /*
+ * The following clause was added to handle a problem
+ * with NFS consistency. It is possible that a lookup
+ * of the file to be opened succeeded, but the file
+ * itself doesn't actually exist on the server. This
+ * is chiefly due to the DNLC containing an entry for
+ * the file which has been removed on the server. In
+ * this case, we just start over. If there was some
+ * other cause for the ESTALE error, then the lookup
+ * of the file will fail and the error will be returned
+ * above instead of looping around from here.
+ */
+ VN_RELE(vp);
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto top;
+ } else
+ *vpp = vp;
+ return (error);
+}
+
+/*
+ * The following two accessor functions are for the NFSv4 server. Since there
+ * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
+ * vnode open counts correct when a client "upgrades" an open or does an
+ * open_downgrade. In NFS, an upgrade or downgrade can not only change the
+ * open mode (add or subtract read or write), but also change the share/deny
+ * modes. However, share reservations are not integrated with OPEN, yet, so
+ * we need to handle each separately. These functions are cleaner than having
+ * the NFS server manipulate the counts directly, however, nobody else should
+ * use these functions.
+ */
+void
+vn_open_upgrade(
+ vnode_t *vp,
+ int filemode)
+{
+ ASSERT(vp->v_type == VREG);
+
+ if (filemode & FREAD)
+ atomic_add_32(&(vp->v_rdcnt), 1);
+ if (filemode & FWRITE)
+ atomic_add_32(&(vp->v_wrcnt), 1);
+
+}
+
+void
+vn_open_downgrade(
+ vnode_t *vp,
+ int filemode)
+{
+ ASSERT(vp->v_type == VREG);
+
+ if (filemode & FREAD) {
+ ASSERT(vp->v_rdcnt > 0);
+ atomic_add_32(&(vp->v_rdcnt), -1);
+ }
+ if (filemode & FWRITE) {
+ ASSERT(vp->v_wrcnt > 0);
+ atomic_add_32(&(vp->v_wrcnt), -1);
+ }
+
+}
+
+int
+vn_create(
+ char *pnamep,
+ enum uio_seg seg,
+ struct vattr *vap,
+ enum vcexcl excl,
+ int mode,
+ struct vnode **vpp,
+ enum create why,
+ int flag,
+ mode_t umask)
+{
+ return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
+ umask, NULL));
+}
+
+/*
+ * Create a vnode (makenode).
+ */
+int
+vn_createat(
+ char *pnamep,
+ enum uio_seg seg,
+ struct vattr *vap,
+ enum vcexcl excl,
+ int mode,
+ struct vnode **vpp,
+ enum create why,
+ int flag,
+ mode_t umask,
+ struct vnode *startvp)
+{
+ struct vnode *dvp; /* ptr to parent dir vnode */
+ struct vnode *vp = NULL;
+ struct pathname pn;
+ int error;
+ int in_crit = 0;
+ struct vattr vattr;
+ enum symfollow follow;
+ int estale_retry = 0;
+ uint32_t auditing = AU_AUDITING();
+
+ ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+ /* symlink interpretation */
+ if ((flag & FNOFOLLOW) || excl == EXCL)
+ follow = NO_FOLLOW;
+ else
+ follow = FOLLOW;
+ flag &= ~(FNOFOLLOW|FNOLINKS);
+
+top:
+ /*
+ * Lookup directory.
+ * If new object is a file, call lower level to create it.
+ * Note that it is up to the lower level to enforce exclusive
+ * creation, if the file is already there.
+ * This allows the lower level to do whatever
+ * locking or protocol that is needed to prevent races.
+ * If the new object is directory call lower level to make
+ * the new directory, with "." and "..".
+ */
+ if (error = pn_get(pnamep, seg, &pn))
+ return (error);
+ if (auditing)
+ audit_vncreate_start();
+ dvp = NULL;
+ *vpp = NULL;
+ /*
+ * lookup will find the parent directory for the vnode.
+ * When it is done the pn holds the name of the entry
+ * in the directory.
+ * If this is a non-exclusive create we also find the node itself.
+ */
+ error = lookuppnat(&pn, NULL, follow, &dvp,
+ (excl == EXCL) ? NULLVPP : vpp, startvp);
+ if (error) {
+ pn_free(&pn);
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto top;
+ if (why == CRMKDIR && error == EINVAL)
+ error = EEXIST; /* SVID */
+ return (error);
+ }
+
+ if (why != CRMKNOD)
+ vap->va_mode &= ~VSVTX;
+
+ /*
+ * If default ACLs are defined for the directory don't apply the
+ * umask if umask is passed.
+ */
+
+ if (umask) {
+
+ vsecattr_t vsec;
+
+ vsec.vsa_aclcnt = 0;
+ vsec.vsa_aclentp = NULL;
+ vsec.vsa_dfaclcnt = 0;
+ vsec.vsa_dfaclentp = NULL;
+ vsec.vsa_mask = VSA_DFACLCNT;
+ error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
+ /*
+ * If error is ENOSYS then treat it as no error
+ * Don't want to force all file systems to support
+ * aclent_t style of ACL's.
+ */
+ if (error == ENOSYS)
+ error = 0;
+ if (error) {
+ if (*vpp != NULL)
+ VN_RELE(*vpp);
+ goto out;
+ } else {
+ /*
+ * Apply the umask if no default ACLs.
+ */
+ if (vsec.vsa_dfaclcnt == 0)
+ vap->va_mode &= ~umask;
+
+ /*
+ * VOP_GETSECATTR() may have allocated memory for
+ * ACLs we didn't request, so double-check and
+ * free it if necessary.
+ */
+ if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
+ kmem_free((caddr_t)vsec.vsa_aclentp,
+ vsec.vsa_aclcnt * sizeof (aclent_t));
+ if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
+ kmem_free((caddr_t)vsec.vsa_dfaclentp,
+ vsec.vsa_dfaclcnt * sizeof (aclent_t));
+ }
+ }
+
+ /*
+ * In general we want to generate EROFS if the file system is
+ * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
+ * documents the open system call, and it says that O_CREAT has no
+ * effect if the file already exists. Bug 1119649 states
+ * that open(path, O_CREAT, ...) fails when attempting to open an
+ * existing file on a read only file system. Thus, the first part
+ * of the following if statement has 3 checks:
+ * if the file exists &&
+ * it is being open with write access &&
+ * the file system is read only
+ * then generate EROFS
+ */
+ if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
+ (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
+ if (*vpp)
+ VN_RELE(*vpp);
+ error = EROFS;
+ } else if (excl == NONEXCL && *vpp != NULL) {
+ vnode_t *rvp;
+
+ /*
+ * File already exists. If a mandatory lock has been
+ * applied, return error.
+ */
+ vp = *vpp;
+ if (VOP_REALVP(vp, &rvp, NULL) != 0)
+ rvp = vp;
+ if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ }
+ if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
+ vattr.va_mask = AT_MODE|AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
+ goto out;
+ }
+ if (MANDLOCK(vp, vattr.va_mode)) {
+ error = EAGAIN;
+ goto out;
+ }
+ /*
+ * File cannot be truncated if non-blocking mandatory
+ * locks are currently on the file.
+ */
+ if ((vap->va_mask & AT_SIZE) && in_crit) {
+ u_offset_t offset;
+ ssize_t length;
+
+ offset = vap->va_size > vattr.va_size ?
+ vattr.va_size : vap->va_size;
+ length = vap->va_size > vattr.va_size ?
+ vap->va_size - vattr.va_size :
+ vattr.va_size - vap->va_size;
+ if (nbl_conflict(vp, NBL_WRITE, offset,
+ length, 0, NULL)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * If the file is the root of a VFS, we've crossed a
+ * mount point and the "containing" directory that we
+ * acquired above (dvp) is irrelevant because it's in
+ * a different file system. We apply VOP_CREATE to the
+ * target itself instead of to the containing directory
+ * and supply a null path name to indicate (conventionally)
+ * the node itself as the "component" of interest.
+ *
+ * The intercession of the file system is necessary to
+ * ensure that the appropriate permission checks are
+ * done.
+ */
+ if (vp->v_flag & VROOT) {
+ ASSERT(why != CRMKDIR);
+ error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
+ CRED(), flag, NULL, NULL);
+ /*
+ * If the create succeeded, it will have created
+ * a new reference to the vnode. Give up the
+ * original reference. The assertion should not
+ * get triggered because NBMAND locks only apply to
+ * VREG files. And if in_crit is non-zero for some
+ * reason, detect that here, rather than when we
+ * dereference a null vp.
+ */
+ ASSERT(in_crit == 0);
+ VN_RELE(vp);
+ vp = NULL;
+ goto out;
+ }
+
+ /*
+ * Large File API - non-large open (FOFFMAX flag not set)
+ * of regular file fails if the file size exceeds MAXOFF32_T.
+ */
+ if (why != CRMKDIR &&
+ !(flag & FOFFMAX) &&
+ (vp->v_type == VREG)) {
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, 0,
+ CRED(), NULL))) {
+ goto out;
+ }
+ if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
+ error = EOVERFLOW;
+ goto out;
+ }
+ }
+ }
+
+ if (error == 0) {
+ /*
+ * Call mkdir() if specified, otherwise create().
+ */
+ int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
+
+ if (why == CRMKDIR)
+ /*
+ * N.B., if vn_createat() ever requests
+ * case-insensitive behavior then it will need
+ * to be passed to VOP_MKDIR(). VOP_CREATE()
+ * will already get it via "flag"
+ */
+ error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
+ NULL, 0, NULL);
+ else if (!must_be_dir)
+ error = VOP_CREATE(dvp, pn.pn_path, vap,
+ excl, mode, vpp, CRED(), flag, NULL, NULL);
+ else
+ error = ENOTDIR;
+ }
+
+out:
+
+ if (auditing)
+ audit_vncreate_finish(*vpp, error);
+ if (in_crit) {
+ nbl_end_crit(vp);
+ in_crit = 0;
+ }
+ if (vp != NULL) {
+ VN_RELE(vp);
+ vp = NULL;
+ }
+ pn_free(&pn);
+ VN_RELE(dvp);
+ /*
+ * The following clause was added to handle a problem
+ * with NFS consistency. It is possible that a lookup
+ * of the file to be created succeeded, but the file
+ * itself doesn't actually exist on the server. This
+ * is chiefly due to the DNLC containing an entry for
+ * the file which has been removed on the server. In
+ * this case, we just start over. If there was some
+ * other cause for the ESTALE error, then the lookup
+ * of the file will fail and the error will be returned
+ * above instead of looping around from here.
+ */
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto top;
+ return (error);
+}
+
+int
+vn_link(char *from, char *to, enum uio_seg seg)
+{
+ return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
+}
+
+int
+vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
+ vnode_t *tstartvp, char *to, enum uio_seg seg)
+{
+ struct vnode *fvp; /* from vnode ptr */
+ struct vnode *tdvp; /* to directory vnode ptr */
+ struct pathname pn;
+ int error;
+ struct vattr vattr;
+ dev_t fsid;
+ int estale_retry = 0;
+ uint32_t auditing = AU_AUDITING();
+
+top:
+ fvp = tdvp = NULL;
+ if (error = pn_get(to, seg, &pn))
+ return (error);
+ if (auditing && fstartvp != NULL)
+ audit_setfsat_path(1);
+ if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
+ goto out;
+ if (auditing && tstartvp != NULL)
+ audit_setfsat_path(3);
+ if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
+ goto out;
+ /*
+ * Make sure both source vnode and target directory vnode are
+ * in the same vfs and that the target vfs is writable.
+ */
+ vattr.va_mask = AT_FSID;
+ if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
+ goto out;
+ fsid = vattr.va_fsid;
+ vattr.va_mask = AT_FSID;
+ if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
+ goto out;
+ if (fsid != vattr.va_fsid) {
+ error = EXDEV;
+ goto out;
+ }
+ if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+ /*
+ * Do the link.
+ */
+ (void) pn_fixslash(&pn);
+ error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
+out:
+ pn_free(&pn);
+ if (fvp)
+ VN_RELE(fvp);
+ if (tdvp)
+ VN_RELE(tdvp);
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto top;
+ return (error);
+}
+
+int
+vn_rename(char *from, char *to, enum uio_seg seg)
+{
+ return (vn_renameat(NULL, from, NULL, to, seg));
+}
+
+int
+vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
+ char *tname, enum uio_seg seg)
+{
+ int error;
+ struct vattr vattr;
+ struct pathname fpn; /* from pathname */
+ struct pathname tpn; /* to pathname */
+ dev_t fsid;
+ int in_crit_src, in_crit_targ;
+ vnode_t *fromvp, *fvp;
+ vnode_t *tovp, *targvp;
+ int estale_retry = 0;
+ uint32_t auditing = AU_AUDITING();
+
+top:
+ fvp = fromvp = tovp = targvp = NULL;
+ in_crit_src = in_crit_targ = 0;
+ /*
+ * Get to and from pathnames.
+ */
+ if (error = pn_get(fname, seg, &fpn))
+ return (error);
+ if (error = pn_get(tname, seg, &tpn)) {
+ pn_free(&fpn);
+ return (error);
+ }
+
+ /*
+ * First we need to resolve the correct directories
+ * The passed in directories may only be a starting point,
+ * but we need the real directories the file(s) live in.
+ * For example the fname may be something like usr/lib/sparc
+ * and we were passed in the / directory, but we need to
+ * use the lib directory for the rename.
+ */
+
+ if (auditing && fdvp != NULL)
+ audit_setfsat_path(1);
+ /*
+ * Lookup to and from directories.
+ */
+ if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
+ goto out;
+ }
+
+ /*
+ * Make sure there is an entry.
+ */
+ if (fvp == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+
+ if (auditing && tdvp != NULL)
+ audit_setfsat_path(3);
+ if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
+ goto out;
+ }
+
+ /*
+ * Make sure both the from vnode directory and the to directory
+ * are in the same vfs and the to directory is writable.
+ * We check fsid's, not vfs pointers, so loopback fs works.
+ */
+ if (fromvp != tovp) {
+ vattr.va_mask = AT_FSID;
+ if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
+ goto out;
+ fsid = vattr.va_fsid;
+ vattr.va_mask = AT_FSID;
+ if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
+ goto out;
+ if (fsid != vattr.va_fsid) {
+ error = EXDEV;
+ goto out;
+ }
+ }
+
+ if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+
+ if (targvp && (fvp != targvp)) {
+ nbl_start_crit(targvp, RW_READER);
+ in_crit_targ = 1;
+ if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ if (nbl_need_check(fvp)) {
+ nbl_start_crit(fvp, RW_READER);
+ in_crit_src = 1;
+ if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ /*
+ * Do the rename.
+ */
+ (void) pn_fixslash(&tpn);
+ error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
+ NULL, 0);
+
+out:
+ pn_free(&fpn);
+ pn_free(&tpn);
+ if (in_crit_src)
+ nbl_end_crit(fvp);
+ if (in_crit_targ)
+ nbl_end_crit(targvp);
+ if (fromvp)
+ VN_RELE(fromvp);
+ if (tovp)
+ VN_RELE(tovp);
+ if (targvp)
+ VN_RELE(targvp);
+ if (fvp)
+ VN_RELE(fvp);
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto top;
+ return (error);
+}
+
+/*
+ * Remove a file or directory.
+ */
+int
+vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
+{
+ return (vn_removeat(NULL, fnamep, seg, dirflag));
+}
+
+int
+vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
+{
+ struct vnode *vp; /* entry vnode */
+ struct vnode *dvp; /* ptr to parent dir vnode */
+ struct vnode *coveredvp;
+ struct pathname pn; /* name of entry */
+ enum vtype vtype;
+ int error;
+ struct vfs *vfsp;
+ struct vfs *dvfsp; /* ptr to parent dir vfs */
+ int in_crit = 0;
+ int estale_retry = 0;
+
+top:
+ if (error = pn_get(fnamep, seg, &pn))
+ return (error);
+ dvp = vp = NULL;
+ if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
+ pn_free(&pn);
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto top;
+ return (error);
+ }
+
+ /*
+ * Make sure there is an entry.
+ */
+ if (vp == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+
+ vfsp = vp->v_vfsp;
+ dvfsp = dvp->v_vfsp;
+
+ /*
+ * If the named file is the root of a mounted filesystem, fail,
+ * unless it's marked unlinkable. In that case, unmount the
+ * filesystem and proceed to unlink the covered vnode. (If the
+ * covered vnode is a directory, use rmdir instead of unlink,
+ * to avoid file system corruption.)
+ */
+ if (vp->v_flag & VROOT) {
+ if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
+ error = EBUSY;
+ goto out;
+ }
+
+ /*
+ * Namefs specific code starts here.
+ */
+
+ if (dirflag == RMDIRECTORY) {
+ /*
+ * User called rmdir(2) on a file that has
+ * had namefs mounted on top of it. Since
+ * namefs doesn't allow directories to
+ * be mounted on other files, we know
+ * vp is not of type VDIR, so fail the operation.
+ */
+ error = ENOTDIR;
+ goto out;
+ }
+
+ /*
+ * If VROOT is still set after grabbing vp->v_lock,
+ * no one has finished nm_unmount so far and coveredvp
+ * is valid.
+ * If we manage to grab vn_vfswlock(coveredvp) before releasing
+ * vp->v_lock, any race window is eliminated.
+ */
+
+ mutex_enter(&vp->v_lock);
+ if ((vp->v_flag & VROOT) == 0) {
+ /* Someone beat us to the unmount */
+ mutex_exit(&vp->v_lock);
+ error = EBUSY;
+ goto out;
+ }
+ vfsp = vp->v_vfsp;
+ coveredvp = vfsp->vfs_vnodecovered;
+ ASSERT(coveredvp);
+ /*
+ * Note: Implementation of vn_vfswlock shows that ordering of
+ * v_lock / vn_vfswlock is not an issue here.
+ */
+ error = vn_vfswlock(coveredvp);
+ mutex_exit(&vp->v_lock);
+
+ if (error)
+ goto out;
+
+ VN_HOLD(coveredvp);
+ VN_RELE(vp);
+ error = dounmount(vfsp, 0, CRED());
+
+ /*
+ * Unmounted the namefs file system; now get
+ * the object it was mounted over.
+ */
+ vp = coveredvp;
+ /*
+ * If namefs was mounted over a directory, then
+ * we want to use rmdir() instead of unlink().
+ */
+ if (vp->v_type == VDIR)
+ dirflag = RMDIRECTORY;
+
+ if (error)
+ goto out;
+ }
+
+ /*
+ * Make sure filesystem is writeable.
+ * We check the parent directory's vfs in case this is an lofs vnode.
+ */
+ if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
+ error = EROFS;
+ goto out;
+ }
+
+ vtype = vp->v_type;
+
+ /*
+ * If there is the possibility of an nbmand share reservation, make
+ * sure it's okay to remove the file. Keep a reference to the
+ * vnode, so that we can exit the nbl critical region after
+ * calling VOP_REMOVE.
+ * If there is no possibility of an nbmand share reservation,
+ * release the vnode reference now. Filesystems like NFS may
+ * behave differently if there is an extra reference, so get rid of
+ * this one. Fortunately, we can't have nbmand mounts on NFS
+ * filesystems.
+ */
+ if (nbl_need_check(vp)) {
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
+ error = EACCES;
+ goto out;
+ }
+ } else {
+ VN_RELE(vp);
+ vp = NULL;
+ }
+
+ if (dirflag == RMDIRECTORY) {
+ /*
+ * Caller is using rmdir(2), which can only be applied to
+ * directories.
+ */
+ if (vtype != VDIR) {
+ error = ENOTDIR;
+ } else {
+ vnode_t *cwd;
+ proc_t *pp = curproc;
+
+ mutex_enter(&pp->p_lock);
+ cwd = PTOU(pp)->u_cdir;
+ VN_HOLD(cwd);
+ mutex_exit(&pp->p_lock);
+ error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
+ NULL, 0);
+ VN_RELE(cwd);
+ }
+ } else {
+ /*
+ * Unlink(2) can be applied to anything.
+ */
+ error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
+ }
+
+out:
+ pn_free(&pn);
+ if (in_crit) {
+ nbl_end_crit(vp);
+ in_crit = 0;
+ }
+ if (vp != NULL)
+ VN_RELE(vp);
+ if (dvp != NULL)
+ VN_RELE(dvp);
+ if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+ goto top;
+ return (error);
+}
+
+/*
+ * Utility function to compare equality of vnodes.
+ * Compare the underlying real vnodes, if there are underlying vnodes.
+ * This is a more thorough comparison than the VN_CMP() macro provides.
+ */
+int
+vn_compare(vnode_t *vp1, vnode_t *vp2)
+{
+ vnode_t *realvp;
+
+ if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
+ vp1 = realvp;
+ if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
+ vp2 = realvp;
+ return (VN_CMP(vp1, vp2));
+}
+
+/*
+ * The number of locks to hash into. This value must be a power
+ * of 2 minus 1 and should probably also be prime.
+ */
+#define NUM_BUCKETS 1023
+
+struct vn_vfslocks_bucket {
+ kmutex_t vb_lock;
+ vn_vfslocks_entry_t *vb_list;
+ char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
+};
+
+/*
+ * Total number of buckets will be NUM_BUCKETS + 1.
+ */
+
+#pragma align 64(vn_vfslocks_buckets)
+static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
+
+#define VN_VFSLOCKS_SHIFT 9
+
+#define VN_VFSLOCKS_HASH(vfsvpptr) \
+ ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
+
+/*
+ * vn_vfslocks_getlock() uses a hash scheme to generate an
+ * rwstlock from the vfs/vnode pointer passed to it.
+ *
+ * vn_vfslocks_rele() releases a reference in the
+ * HASH table which allows the entry allocated by
+ * vn_vfslocks_getlock() to be freed at a later
+ * stage when the refcount drops to zero.
+ */
+
+vn_vfslocks_entry_t *
+vn_vfslocks_getlock(void *vfsvpptr)
+{
+ struct vn_vfslocks_bucket *bp;
+ vn_vfslocks_entry_t *vep;
+ vn_vfslocks_entry_t *tvep;
+
+ ASSERT(vfsvpptr != NULL);
+ bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
+
+ mutex_enter(&bp->vb_lock);
+ for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
+ if (vep->ve_vpvfs == vfsvpptr) {
+ vep->ve_refcnt++;
+ mutex_exit(&bp->vb_lock);
+ return (vep);
+ }
+ }
+ mutex_exit(&bp->vb_lock);
+ vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
+ rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
+ vep->ve_vpvfs = (char *)vfsvpptr;
+ vep->ve_refcnt = 1;
+ mutex_enter(&bp->vb_lock);
+ for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
+ if (tvep->ve_vpvfs == vfsvpptr) {
+ tvep->ve_refcnt++;
+ mutex_exit(&bp->vb_lock);
+
+ /*
+ * There is already an entry in the hash table;
+ * destroy what we just allocated.
+ */
+ rwst_destroy(&vep->ve_lock);
+ kmem_free(vep, sizeof (*vep));
+ return (tvep);
+ }
+ }
+ vep->ve_next = bp->vb_list;
+ bp->vb_list = vep;
+ mutex_exit(&bp->vb_lock);
+ return (vep);
+}
+
+void
+vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
+{
+ struct vn_vfslocks_bucket *bp;
+ vn_vfslocks_entry_t *vep;
+ vn_vfslocks_entry_t *pvep;
+
+ ASSERT(vepent != NULL);
+ ASSERT(vepent->ve_vpvfs != NULL);
+
+ bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
+
+ mutex_enter(&bp->vb_lock);
+ vepent->ve_refcnt--;
+
+ if ((int32_t)vepent->ve_refcnt < 0)
+ cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
+
+ if (vepent->ve_refcnt == 0) {
+ for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
+ if (vep->ve_vpvfs == vepent->ve_vpvfs) {
+ if (bp->vb_list == vep)
+ bp->vb_list = vep->ve_next;
+ else {
+ /* LINTED */
+ pvep->ve_next = vep->ve_next;
+ }
+ mutex_exit(&bp->vb_lock);
+ rwst_destroy(&vep->ve_lock);
+ kmem_free(vep, sizeof (*vep));
+ return;
+ }
+ pvep = vep;
+ }
+ cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
+ }
+ mutex_exit(&bp->vb_lock);
+}
+
+/*
+ * vn_vfswlock_wait is used to implement a lock which is logically a writers
+ * lock protecting the v_vfsmountedhere field.
+ * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
+ * except that it blocks to acquire the lock VVFSLOCK.
+ *
+ * traverse() and routines re-implementing part of traverse (e.g. autofs)
+ * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
+ * need the non-blocking version of the writers lock, i.e. vn_vfswlock().
+ */
+int
+vn_vfswlock_wait(vnode_t *vp)
+{
+ int retval;
+ vn_vfslocks_entry_t *vpvfsentry;
+ ASSERT(vp != NULL);
+
+ vpvfsentry = vn_vfslocks_getlock(vp);
+ retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
+
+ if (retval == EINTR) {
+ vn_vfslocks_rele(vpvfsentry);
+ return (EINTR);
+ }
+ return (retval);
+}
+
+int
+vn_vfsrlock_wait(vnode_t *vp)
+{
+ int retval;
+ vn_vfslocks_entry_t *vpvfsentry;
+ ASSERT(vp != NULL);
+
+ vpvfsentry = vn_vfslocks_getlock(vp);
+ retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
+
+ if (retval == EINTR) {
+ vn_vfslocks_rele(vpvfsentry);
+ return (EINTR);
+ }
+
+ return (retval);
+}
+
+
+/*
+ * vn_vfswlock is used to implement a lock which is logically a writers lock
+ * protecting the v_vfsmountedhere field.
+ */
+int
+vn_vfswlock(vnode_t *vp)
+{
+ vn_vfslocks_entry_t *vpvfsentry;
+
+ /*
+ * If vp is NULL then somebody is trying to lock the covered vnode
+ * of /. (vfs_vnodecovered is NULL for /). This situation will
+ * only happen when unmounting /. Since that operation will fail
+ * anyway, return EBUSY here instead of in VFS_UNMOUNT.
+ */
+ if (vp == NULL)
+ return (EBUSY);
+
+ vpvfsentry = vn_vfslocks_getlock(vp);
+
+ if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
+ return (0);
+
+ vn_vfslocks_rele(vpvfsentry);
+ return (EBUSY);
+}
+
+int
+vn_vfsrlock(vnode_t *vp)
+{
+ vn_vfslocks_entry_t *vpvfsentry;
+
+ /*
+ * If vp is NULL then somebody is trying to lock the covered vnode
+ * of /. (vfs_vnodecovered is NULL for /). This situation will
+ * only happen when unmounting /. Since that operation will fail
+ * anyway, return EBUSY here instead of in VFS_UNMOUNT.
+ */
+ if (vp == NULL)
+ return (EBUSY);
+
+ vpvfsentry = vn_vfslocks_getlock(vp);
+
+ if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
+ return (0);
+
+ vn_vfslocks_rele(vpvfsentry);
+ return (EBUSY);
+}
+
+void
+vn_vfsunlock(vnode_t *vp)
+{
+ vn_vfslocks_entry_t *vpvfsentry;
+
+ /*
+ * ve_refcnt needs to be decremented twice.
+ * 1. To release the reference after a call to vn_vfslocks_getlock()
+ * 2. To release the reference from the locking routines like
+ * vn_vfsrlock/vn_vfswlock etc.
+ */
+ vpvfsentry = vn_vfslocks_getlock(vp);
+ vn_vfslocks_rele(vpvfsentry);
+
+ rwst_exit(&vpvfsentry->ve_lock);
+ vn_vfslocks_rele(vpvfsentry);
+}
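+
+/*
+ * Illustrative sketch (not part of the original source): every successful
+ * vn_vfswlock()/vn_vfsrlock() holds both the rwstlock and a reference on
+ * the hash entry, and both are dropped by the matching vn_vfsunlock():
+ *
+ *	if (vn_vfswlock(vp) != 0)
+ *		return (EBUSY);
+ *	... examine or update vp->v_vfsmountedhere ...
+ *	vn_vfsunlock(vp);
+ */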
+
+int
+vn_vfswlock_held(vnode_t *vp)
+{
+ int held;
+ vn_vfslocks_entry_t *vpvfsentry;
+
+ ASSERT(vp != NULL);
+
+ vpvfsentry = vn_vfslocks_getlock(vp);
+ held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
+
+ vn_vfslocks_rele(vpvfsentry);
+ return (held);
+}
+
+
+int
+vn_make_ops(
+ const char *name, /* Name of file system */
+ const fs_operation_def_t *templ, /* Operation specification */
+ vnodeops_t **actual) /* Return the vnodeops */
+{
+ int unused_ops;
+ int error;
+
+ *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
+
+ (*actual)->vnop_name = name;
+
+ error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
+ if (error) {
+ kmem_free(*actual, sizeof (vnodeops_t));
+ }
+
+#if DEBUG
+ if (unused_ops != 0)
+ cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
+ "but not used", name, unused_ops);
+#endif
+
+ return (error);
+}
+
+/*
+ * Free the vnodeops created as a result of vn_make_ops()
+ */
+void
+vn_freevnodeops(vnodeops_t *vnops)
+{
+ kmem_free(vnops, sizeof (vnodeops_t));
+}
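+
+/*
+ * Illustrative sketch (not part of the original source; the "myfs" names
+ * are hypothetical): a file system typically builds its vnodeops_t once at
+ * module load time from an fs_operation_def_t template and frees it again
+ * at unload:
+ *
+ *	static vnodeops_t *myfs_vnodeops;
+ *	static const fs_operation_def_t myfs_vnodeops_template[] = {
+ *		VOPNAME_OPEN,	{ .vop_open = myfs_open },
+ *		VOPNAME_CLOSE,	{ .vop_close = myfs_close },
+ *		NULL,		NULL
+ *	};
+ *
+ *	error = vn_make_ops("myfs", myfs_vnodeops_template, &myfs_vnodeops);
+ *	...
+ *	vn_freevnodeops(myfs_vnodeops);
+ */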
+
+/*
+ * Vnode cache.
+ */
+
+/* ARGSUSED */
+static int
+vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct vnode *vp;
+
+ vp = buf;
+
+ mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
+ rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
+ vp->v_femhead = NULL; /* Must be done before vn_reinit() */
+ vp->v_path = NULL;
+ vp->v_mpssdata = NULL;
+ vp->v_vsd = NULL;
+ vp->v_fopdata = NULL;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vn_cache_destructor(void *buf, void *cdrarg)
+{
+ struct vnode *vp;
+
+ vp = buf;
+
+ rw_destroy(&vp->v_nbllock);
+ cv_destroy(&vp->v_cv);
+ mutex_destroy(&vp->v_vsd_lock);
+ mutex_destroy(&vp->v_lock);
+}
+
+void
+vn_create_cache(void)
+{
+ /* LINTED */
+ ASSERT((1 << VNODE_ALIGN_LOG2) ==
+ P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
+ vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
+ VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
+ NULL, 0);
+}
+
+void
+vn_destroy_cache(void)
+{
+ kmem_cache_destroy(vn_cache);
+}
+
+/*
+ * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
+ * cached by the file system and vnodes remain associated.
+ */
+void
+vn_recycle(vnode_t *vp)
+{
+ ASSERT(vp->v_pages == NULL);
+
+ /*
+ * XXX - This really belongs in vn_reinit(), but we have some issues
+ * with the counts. Best to have it here for clean initialization.
+ */
+ vp->v_rdcnt = 0;
+ vp->v_wrcnt = 0;
+ vp->v_mmap_read = 0;
+ vp->v_mmap_write = 0;
+
+ /*
+ * If FEM was in use, make sure everything gets cleaned up
+ * NOTE: vp->v_femhead is initialized to NULL in the vnode
+ * constructor.
+ */
+ if (vp->v_femhead) {
+ /* XXX - There should be a free_femhead() that does all this */
+ ASSERT(vp->v_femhead->femh_list == NULL);
+ mutex_destroy(&vp->v_femhead->femh_lock);
+ kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
+ vp->v_femhead = NULL;
+ }
+ if (vp->v_path) {
+ kmem_free(vp->v_path, strlen(vp->v_path) + 1);
+ vp->v_path = NULL;
+ }
+
+ if (vp->v_fopdata != NULL) {
+ free_fopdata(vp);
+ }
+ vp->v_mpssdata = NULL;
+ vsd_free(vp);
+}
+
+/*
+ * Used to reset the vnode fields including those that are directly accessible
+ * as well as those which require an accessor function.
+ *
+ * Does not initialize:
+ * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
+ * v_data (since FS-nodes and vnodes point to each other and should
+ * be updated simultaneously)
+ * v_op (in case someone needs to make a VOP call on this object)
+ */
+void
+vn_reinit(vnode_t *vp)
+{
+ vp->v_count = 1;
+ vp->v_count_dnlc = 0;
+ vp->v_vfsp = NULL;
+ vp->v_stream = NULL;
+ vp->v_vfsmountedhere = NULL;
+ vp->v_flag = 0;
+ vp->v_type = VNON;
+ vp->v_rdev = NODEV;
+
+ vp->v_filocks = NULL;
+ vp->v_shrlocks = NULL;
+ vp->v_pages = NULL;
+
+ vp->v_locality = NULL;
+ vp->v_xattrdir = NULL;
+
+ /* Handles v_femhead, v_path, and the r/w/map counts */
+ vn_recycle(vp);
+}
+
+vnode_t *
+vn_alloc(int kmflag)
+{
+ vnode_t *vp;
+
+ vp = kmem_cache_alloc(vn_cache, kmflag);
+
+ if (vp != NULL) {
+ vp->v_femhead = NULL; /* Must be done before vn_reinit() */
+ vp->v_fopdata = NULL;
+ vn_reinit(vp);
+ }
+
+ return (vp);
+}
+
+void
+vn_free(vnode_t *vp)
+{
+ ASSERT(vp->v_shrlocks == NULL);
+ ASSERT(vp->v_filocks == NULL);
+
+ /*
+ * Some file systems call vn_free() with v_count of zero,
+ * some with v_count of 1. In any case, the value should
+ * never be anything else.
+ */
+ ASSERT((vp->v_count == 0) || (vp->v_count == 1));
+ ASSERT(vp->v_count_dnlc == 0);
+ if (vp->v_path != NULL) {
+ kmem_free(vp->v_path, strlen(vp->v_path) + 1);
+ vp->v_path = NULL;
+ }
+
+ /* If FEM was in use, make sure everything gets cleaned up */
+ if (vp->v_femhead) {
+ /* XXX - There should be a free_femhead() that does all this */
+ ASSERT(vp->v_femhead->femh_list == NULL);
+ mutex_destroy(&vp->v_femhead->femh_lock);
+ kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
+ vp->v_femhead = NULL;
+ }
+
+ if (vp->v_fopdata != NULL) {
+ free_fopdata(vp);
+ }
+ vp->v_mpssdata = NULL;
+ vsd_free(vp);
+ kmem_cache_free(vn_cache, vp);
+}
+
+/*
+ * Vnode status changes; we should define better states than 1, 0.
+ */
+void
+vn_reclaim(vnode_t *vp)
+{
+ vfs_t *vfsp = vp->v_vfsp;
+
+ if (vfsp == NULL ||
+ vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
+ return;
+ }
+ (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
+}
+
+void
+vn_idle(vnode_t *vp)
+{
+ vfs_t *vfsp = vp->v_vfsp;
+
+ if (vfsp == NULL ||
+ vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
+ return;
+ }
+ (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
+}
+void
+vn_exists(vnode_t *vp)
+{
+ vfs_t *vfsp = vp->v_vfsp;
+
+ if (vfsp == NULL ||
+ vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
+ return;
+ }
+ (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
+}
+
+void
+vn_invalid(vnode_t *vp)
+{
+ vfs_t *vfsp = vp->v_vfsp;
+
+ if (vfsp == NULL ||
+ vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
+ return;
+ }
+ (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
+}
+
+/* Vnode event notification */
+
+int
+vnevent_support(vnode_t *vp, caller_context_t *ct)
+{
+ if (vp == NULL)
+ return (EINVAL);
+
+ return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
+}
+
+void
+vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
+}
+
+void
+vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
+ caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
+}
+
+void
+vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
+}
+
+void
+vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
+}
+
+void
+vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
+}
+
+void
+vnevent_create(vnode_t *vp, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
+}
+
+void
+vnevent_link(vnode_t *vp, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
+}
+
+void
+vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
+{
+ if (vp == NULL || vp->v_femhead == NULL) {
+ return;
+ }
+ (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
+}
+
+/*
+ * Vnode accessors.
+ */
+
+int
+vn_is_readonly(vnode_t *vp)
+{
+ return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
+}
+
+int
+vn_has_flocks(vnode_t *vp)
+{
+ return (vp->v_filocks != NULL);
+}
+
+int
+vn_has_mandatory_locks(vnode_t *vp, int mode)
+{
+ return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
+}
+
+int
+vn_has_cached_data(vnode_t *vp)
+{
+ return (vp->v_pages != NULL);
+}
+
+/*
+ * Return 0 if the vnode in question shouldn't be permitted into a zone via
+ * zone_enter(2).
+ */
+int
+vn_can_change_zones(vnode_t *vp)
+{
+ struct vfssw *vswp;
+ int allow = 1;
+ vnode_t *rvp;
+
+ if (nfs_global_client_only != 0)
+ return (1);
+
+ /*
+ * We always want to look at the underlying vnode if there is one.
+ */
+ if (VOP_REALVP(vp, &rvp, NULL) != 0)
+ rvp = vp;
+ /*
+ * Some pseudo filesystems (including doorfs) don't actually register
+ * their vfsops_t, so the following may return NULL; we happily let
+ * such vnodes switch zones.
+ */
+ vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
+ if (vswp != NULL) {
+ if (vswp->vsw_flag & VSW_NOTZONESAFE)
+ allow = 0;
+ vfs_unrefvfssw(vswp);
+ }
+ return (allow);
+}
+
+/*
+ * Return nonzero if the vnode is a mount point, zero if not.
+ */
+int
+vn_ismntpt(vnode_t *vp)
+{
+ return (vp->v_vfsmountedhere != NULL);
+}
+
+/* Retrieve the vfs (if any) mounted on this vnode */
+vfs_t *
+vn_mountedvfs(vnode_t *vp)
+{
+ return (vp->v_vfsmountedhere);
+}
+
+/*
+ * Return nonzero if the vnode is referenced by the dnlc, zero if not.
+ */
+int
+vn_in_dnlc(vnode_t *vp)
+{
+ return (vp->v_count_dnlc > 0);
+}
+
+/*
+ * vn_has_other_opens() checks whether a particular file is opened by more than
+ * just the caller and whether the open is for read and/or write.
+ * This routine is meant to be called after the caller has already called
+ * VOP_OPEN() and wishes to know whether it is the only one with the file
+ * open for the mode(s) specified.
+ *
+ * Vnode counts are only kept on regular files (v_type=VREG).
+ */
+int
+vn_has_other_opens(
+ vnode_t *vp,
+ v_mode_t mode)
+{
+
+ ASSERT(vp != NULL);
+
+ switch (mode) {
+ case V_WRITE:
+ if (vp->v_wrcnt > 1)
+ return (V_TRUE);
+ break;
+ case V_RDORWR:
+ if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
+ return (V_TRUE);
+ break;
+ case V_RDANDWR:
+ if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
+ return (V_TRUE);
+ break;
+ case V_READ:
+ if (vp->v_rdcnt > 1)
+ return (V_TRUE);
+ break;
+ }
+
+ return (V_FALSE);
+}
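+
+/*
+ * Illustrative sketch (not part of the original source): after a
+ * successful VOP_OPEN() for writing, a caller can check whether it is the
+ * only writer:
+ *
+ *	if (vn_has_other_opens(vp, V_WRITE))
+ *		... some other process also has the file open for write ...
+ */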
+
+/*
+ * vn_is_opened() checks whether a particular file is opened and
+ * whether the open is for read and/or write.
+ *
+ * Vnode counts are only kept on regular files (v_type=VREG).
+ */
+int
+vn_is_opened(
+ vnode_t *vp,
+ v_mode_t mode)
+{
+
+ ASSERT(vp != NULL);
+
+ switch (mode) {
+ case V_WRITE:
+ if (vp->v_wrcnt)
+ return (V_TRUE);
+ break;
+ case V_RDANDWR:
+ if (vp->v_rdcnt && vp->v_wrcnt)
+ return (V_TRUE);
+ break;
+ case V_RDORWR:
+ if (vp->v_rdcnt || vp->v_wrcnt)
+ return (V_TRUE);
+ break;
+ case V_READ:
+ if (vp->v_rdcnt)
+ return (V_TRUE);
+ break;
+ }
+
+ return (V_FALSE);
+}
+
+/*
+ * vn_is_mapped() checks whether a particular file is mapped and whether
+ * the file is mapped read and/or write.
+ */
+int
+vn_is_mapped(
+ vnode_t *vp,
+ v_mode_t mode)
+{
+
+ ASSERT(vp != NULL);
+
+#if !defined(_LP64)
+ switch (mode) {
+ /*
+ * The atomic_add_64_nv functions force atomicity on
+ * 32-bit architectures. Otherwise the 64-bit values
+ * would require two fetches, and the value of the fields
+ * could change between the first fetch and the second.
+ */
+ case V_WRITE:
+ if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
+ return (V_TRUE);
+ break;
+ case V_RDANDWR:
+ if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
+ (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
+ return (V_TRUE);
+ break;
+ case V_RDORWR:
+ if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
+ (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
+ return (V_TRUE);
+ break;
+ case V_READ:
+ if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
+ return (V_TRUE);
+ break;
+ }
+#else
+ switch (mode) {
+ case V_WRITE:
+ if (vp->v_mmap_write)
+ return (V_TRUE);
+ break;
+ case V_RDANDWR:
+ if (vp->v_mmap_read && vp->v_mmap_write)
+ return (V_TRUE);
+ break;
+ case V_RDORWR:
+ if (vp->v_mmap_read || vp->v_mmap_write)
+ return (V_TRUE);
+ break;
+ case V_READ:
+ if (vp->v_mmap_read)
+ return (V_TRUE);
+ break;
+ }
+#endif
+
+ return (V_FALSE);
+}
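+
+/*
+ * Illustrative sketch (not part of the original source): adding zero with
+ * atomic_add_64_nv() is the idiom used above for an atomic 64-bit read on
+ * 32-bit kernels, where a plain load would take two fetches:
+ *
+ *	uint64_t reads = atomic_add_64_nv(&vp->v_mmap_read, 0);
+ */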
+
+/*
+ * Set the operations vector for a vnode.
+ *
+ * FEM ensures that the v_femhead pointer is filled in before the
+ * v_op pointer is changed. This means that if the v_femhead pointer
+ * is NULL, and the v_op field hasn't changed since we checked
+ * the v_femhead pointer, then our update is ok - we are not racing with
+ * FEM.
+ */
+void
+vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
+{
+ vnodeops_t *op;
+
+ ASSERT(vp != NULL);
+ ASSERT(vnodeops != NULL);
+
+ op = vp->v_op;
+ membar_consumer();
+ /*
+ * If vp->v_femhead == NULL, then we'll call casptr() to do the
+ * compare-and-swap on vp->v_op. If either fails, then FEM is
+ * in effect on the vnode and we need to have FEM deal with it.
+ */
+ if (vp->v_femhead != NULL || casptr(&vp->v_op, op, vnodeops) != op) {
+ fem_setvnops(vp, vnodeops);
+ }
+}
+
+/*
+ * Retrieve the operations vector for a vnode
+ * As with vn_setops() above, make sure we aren't racing with FEM.
+ * FEM sets the v_op to a special, internal, vnodeops that wouldn't
+ * make sense to the callers of this routine.
+ */
+vnodeops_t *
+vn_getops(vnode_t *vp)
+{
+ vnodeops_t *op;
+
+ ASSERT(vp != NULL);
+
+ op = vp->v_op;
+ membar_consumer();
+ if (vp->v_femhead == NULL && op == vp->v_op) {
+ return (op);
+ } else {
+ return (fem_getvnops(vp));
+ }
+}
+
+/*
+ * Returns non-zero (1) if the vnodeops matches that of the vnode.
+ * Returns zero (0) if not.
+ */
+int
+vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
+{
+ return (vn_getops(vp) == vnodeops);
+}
+
+/*
+ * Returns non-zero (1) if the specified operation matches the
+ * corresponding operation of the vnode.
+ * Returns zero (0) if not.
+ */
+
+#define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
+
+int
+vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
+{
+ const fs_operation_trans_def_t *otdp;
+ fs_generic_func_p *loc = NULL;
+ vnodeops_t *vop = vn_getops(vp);
+
+ ASSERT(vopname != NULL);
+
+ for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
+ if (MATCHNAME(otdp->name, vopname)) {
+ loc = (fs_generic_func_p *)
+ ((char *)(vop) + otdp->offset);
+ break;
+ }
+ }
+
+ return ((loc != NULL) && (*loc == funcp));
+}
+
+/*
+ * fs_new_caller_id() needs to return a unique ID on a given local system.
+ * The IDs do not need to survive across reboots. These are primarily
+ * used so that (FEM) monitors can detect particular callers (such as
+ * the NFS server) to a given vnode/vfs operation.
+ */
+u_longlong_t
+fs_new_caller_id()
+{
+ static uint64_t next_caller_id = 0LL; /* First call returns 1 */
+
+ return ((u_longlong_t)atomic_add_64_nv(&next_caller_id, 1));
+}
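+
+/*
+ * Illustrative sketch (not part of the original source): a subsystem such
+ * as the NFS server obtains an ID once at initialization and places it in
+ * the caller_context_t passed to VOP calls so that FEM monitors can
+ * recognize it ("my_caller_id" is a hypothetical name):
+ *
+ *	my_caller_id = fs_new_caller_id();
+ *	...
+ *	ct.cc_caller_id = my_caller_id;
+ */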
+
+/*
+ * Given a starting vnode and a path, updates the path in the target vnode in
+ * a safe manner. If the vnode already has path information embedded, then the
+ * cached path is left untouched.
+ */
+
+size_t max_vnode_path = 4 * MAXPATHLEN;
+
+void
+vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
+ const char *path, size_t plen)
+{
+ char *rpath;
+ vnode_t *base;
+ size_t rpathlen, rpathalloc;
+ int doslash = 1;
+
+ if (*path == '/') {
+ base = rootvp;
+ path++;
+ plen--;
+ } else {
+ base = startvp;
+ }
+
+ /*
+ * We cannot grab base->v_lock while we hold vp->v_lock because of
+ * the potential for deadlock.
+ */
+ mutex_enter(&base->v_lock);
+ if (base->v_path == NULL) {
+ mutex_exit(&base->v_lock);
+ return;
+ }
+
+ rpathlen = strlen(base->v_path);
+ rpathalloc = rpathlen + plen + 1;
+ /* Avoid adding a slash if there's already one there */
+ if (base->v_path[rpathlen-1] == '/')
+ doslash = 0;
+ else
+ rpathalloc++;
+
+ /*
+ * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
+ * so we must do this dance. If, by chance, something changes the path,
+ * just give up since there is no real harm.
+ */
+ mutex_exit(&base->v_lock);
+
+ /* Paths should stay within reason */
+ if (rpathalloc > max_vnode_path)
+ return;
+
+ rpath = kmem_alloc(rpathalloc, KM_SLEEP);
+
+ mutex_enter(&base->v_lock);
+ if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
+ mutex_exit(&base->v_lock);
+ kmem_free(rpath, rpathalloc);
+ return;
+ }
+ bcopy(base->v_path, rpath, rpathlen);
+ mutex_exit(&base->v_lock);
+
+ if (doslash)
+ rpath[rpathlen++] = '/';
+ bcopy(path, rpath + rpathlen, plen);
+ rpath[rpathlen + plen] = '\0';
+
+ mutex_enter(&vp->v_lock);
+ if (vp->v_path != NULL) {
+ mutex_exit(&vp->v_lock);
+ kmem_free(rpath, rpathalloc);
+ } else {
+ vp->v_path = rpath;
+ mutex_exit(&vp->v_lock);
+ }
+}
+
+/*
+ * Sets the path to the vnode to be the given string, regardless of current
+ * context. The string must be a complete path from rootdir. This is only used
+ * by fsop_root() for setting the path based on the mountpoint.
+ */
+void
+vn_setpath_str(struct vnode *vp, const char *str, size_t len)
+{
+ char *buf = kmem_alloc(len + 1, KM_SLEEP);
+
+ mutex_enter(&vp->v_lock);
+ if (vp->v_path != NULL) {
+ mutex_exit(&vp->v_lock);
+ kmem_free(buf, len + 1);
+ return;
+ }
+
+ vp->v_path = buf;
+ bcopy(str, vp->v_path, len);
+ vp->v_path[len] = '\0';
+
+ mutex_exit(&vp->v_lock);
+}
+
+/*
+ * Called from within filesystem's vop_rename() to handle renames once the
+ * target vnode is available.
+ */
+void
+vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
+{
+ char *tmp;
+
+ mutex_enter(&vp->v_lock);
+ tmp = vp->v_path;
+ vp->v_path = NULL;
+ mutex_exit(&vp->v_lock);
+ vn_setpath(rootdir, dvp, vp, nm, len);
+ if (tmp != NULL)
+ kmem_free(tmp, strlen(tmp) + 1);
+}
+
+/*
+ * Similar to vn_setpath_str(), this function sets the path of the destination
+ * vnode to be the same as that of the source vnode.
+ */
+void
+vn_copypath(struct vnode *src, struct vnode *dst)
+{
+ char *buf;
+ int alloc;
+
+ mutex_enter(&src->v_lock);
+ if (src->v_path == NULL) {
+ mutex_exit(&src->v_lock);
+ return;
+ }
+ alloc = strlen(src->v_path) + 1;
+
+ /* avoid kmem_alloc() with lock held */
+ mutex_exit(&src->v_lock);
+ buf = kmem_alloc(alloc, KM_SLEEP);
+ mutex_enter(&src->v_lock);
+ if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
+ mutex_exit(&src->v_lock);
+ kmem_free(buf, alloc);
+ return;
+ }
+ bcopy(src->v_path, buf, alloc);
+ mutex_exit(&src->v_lock);
+
+ mutex_enter(&dst->v_lock);
+ if (dst->v_path != NULL) {
+ mutex_exit(&dst->v_lock);
+ kmem_free(buf, alloc);
+ return;
+ }
+ dst->v_path = buf;
+ mutex_exit(&dst->v_lock);
+}
+
+/*
+ * XXX Private interface for segvn routines that handle vnode
+ * large page segments.
+ *
+ * return 1 if vp's file system VOP_PAGEIO() implementation
+ * can be safely used instead of VOP_GETPAGE() for handling
+ * pagefaults against regular non-swap files. VOP_PAGEIO()
+ * interface is considered safe here if its implementation
+ * is very close to VOP_GETPAGE() implementation.
+ * e.g. it zeroes out the part of the page beyond EOF, doesn't
+ * panic if there are file holes but instead returns an error, and
+ * doesn't assume the file won't be changed by user writes, etc.
+ *
+ * return 0 otherwise.
+ *
+ * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
+ */
+int
+vn_vmpss_usepageio(vnode_t *vp)
+{
+ vfs_t *vfsp = vp->v_vfsp;
+ char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
+ char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
+ char **fsok = pageio_ok_fss;
+
+ if (fsname == NULL) {
+ return (0);
+ }
+
+ for (; *fsok; fsok++) {
+ if (strcmp(*fsok, fsname) == 0) {
+ return (1);
+ }
+ }
+ return (0);
+}
+
+/* VOP_XXX() macros call the corresponding fop_xxx() function */
+
+int
+fop_open(
+ vnode_t **vpp,
+ int mode,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int ret;
+ vnode_t *vp = *vpp;
+
+ VN_HOLD(vp);
+ /*
+ * Adding to the vnode counts before calling open
+ * avoids the need for a mutex. It circumvents a race
+ * condition where a query made on the vnode counts results in a
+ * false negative. The inquirer goes away believing the file is
+ * not open when there is an open on the file already under way.
+ *
+ * The counts are meant to prevent NFS from granting a delegation
+ * when it would be dangerous to do so.
+ *
+ * The vnode counts are only kept on regular files
+ */
+ if ((*vpp)->v_type == VREG) {
+ if (mode & FREAD)
+ atomic_add_32(&((*vpp)->v_rdcnt), 1);
+ if (mode & FWRITE)
+ atomic_add_32(&((*vpp)->v_wrcnt), 1);
+ }
+
+ VOPXID_MAP_CR(vp, cr);
+
+ ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
+
+ if (ret) {
+ /*
+ * Use the saved vp just in case the vnode ptr got trashed
+ * by the error.
+ */
+ VOPSTATS_UPDATE(vp, open);
+ if ((vp->v_type == VREG) && (mode & FREAD))
+ atomic_add_32(&(vp->v_rdcnt), -1);
+ if ((vp->v_type == VREG) && (mode & FWRITE))
+ atomic_add_32(&(vp->v_wrcnt), -1);
+ } else {
+ /*
+ * Some filesystems will return a different vnode,
+ * but the same path was still used to open it.
+ * So if we do change the vnode and need to
+ * copy over the path, do so here, rather than special
+ * casing each filesystem. Adjust the vnode counts to
+ * reflect the vnode switch.
+ */
+ VOPSTATS_UPDATE(*vpp, open);
+ if (*vpp != vp && *vpp != NULL) {
+ vn_copypath(vp, *vpp);
+ if (((*vpp)->v_type == VREG) && (mode & FREAD))
+ atomic_add_32(&((*vpp)->v_rdcnt), 1);
+ if ((vp->v_type == VREG) && (mode & FREAD))
+ atomic_add_32(&(vp->v_rdcnt), -1);
+ if (((*vpp)->v_type == VREG) && (mode & FWRITE))
+ atomic_add_32(&((*vpp)->v_wrcnt), 1);
+ if ((vp->v_type == VREG) && (mode & FWRITE))
+ atomic_add_32(&(vp->v_wrcnt), -1);
+ }
+ }
+ VN_RELE(vp);
+ return (ret);
+}
+
+int
+fop_close(
+ vnode_t *vp,
+ int flag,
+ int count,
+ offset_t offset,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
+ VOPSTATS_UPDATE(vp, close);
+ /*
+ * Check passed in count to handle possible dups. Vnode counts are only
+ * kept on regular files
+ */
+ if ((vp->v_type == VREG) && (count == 1)) {
+ if (flag & FREAD) {
+ ASSERT(vp->v_rdcnt > 0);
+ atomic_add_32(&(vp->v_rdcnt), -1);
+ }
+ if (flag & FWRITE) {
+ ASSERT(vp->v_wrcnt > 0);
+ atomic_add_32(&(vp->v_wrcnt), -1);
+ }
+ }
+ return (err);
+}
+
+int
+fop_read(
+ vnode_t *vp,
+ uio_t *uiop,
+ int ioflag,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+ ssize_t resid_start = uiop->uio_resid;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
+ VOPSTATS_UPDATE_IO(vp, read,
+ read_bytes, (resid_start - uiop->uio_resid));
+ return (err);
+}
+
+int
+fop_write(
+ vnode_t *vp,
+ uio_t *uiop,
+ int ioflag,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+ ssize_t resid_start = uiop->uio_resid;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
+ VOPSTATS_UPDATE_IO(vp, write,
+ write_bytes, (resid_start - uiop->uio_resid));
+ return (err);
+}
+
+int
+fop_ioctl(
+ vnode_t *vp,
+ int cmd,
+ intptr_t arg,
+ int flag,
+ cred_t *cr,
+ int *rvalp,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
+ VOPSTATS_UPDATE(vp, ioctl);
+ return (err);
+}
+
+int
+fop_setfl(
+ vnode_t *vp,
+ int oflags,
+ int nflags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
+ VOPSTATS_UPDATE(vp, setfl);
+ return (err);
+}
+
+int
+fop_getattr(
+ vnode_t *vp,
+ vattr_t *vap,
+ int flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ /*
+ * If this file system doesn't understand the xvattr extensions
+ * then turn off the xvattr bit.
+ */
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
+ vap->va_mask &= ~AT_XVATTR;
+ }
+
+ /*
+ * We're only allowed to skip the ACL check iff we used a 32 bit
+ * ACE mask with VOP_ACCESS() to determine permissions.
+ */
+ if ((flags & ATTR_NOACLCHECK) &&
+ vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
+ return (EINVAL);
+ }
+ err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
+ VOPSTATS_UPDATE(vp, getattr);
+ return (err);
+}
+
+int
+fop_setattr(
+ vnode_t *vp,
+ vattr_t *vap,
+ int flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ /*
+ * If this file system doesn't understand the xvattr extensions
+ * then turn off the xvattr bit.
+ */
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
+ vap->va_mask &= ~AT_XVATTR;
+ }
+
+ /*
+ * We're only allowed to skip the ACL check iff we used a 32 bit
+ * ACE mask with VOP_ACCESS() to determine permissions.
+ */
+ if ((flags & ATTR_NOACLCHECK) &&
+ vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
+ return (EINVAL);
+ }
+ err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
+ VOPSTATS_UPDATE(vp, setattr);
+ return (err);
+}
+
+int
+fop_access(
+ vnode_t *vp,
+ int mode,
+ int flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ if ((flags & V_ACE_MASK) &&
+ vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
+ return (EINVAL);
+ }
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
+ VOPSTATS_UPDATE(vp, access);
+ return (err);
+}
+
+int
+fop_lookup(
+ vnode_t *dvp,
+ char *nm,
+ vnode_t **vpp,
+ pathname_t *pnp,
+ int flags,
+ vnode_t *rdir,
+ cred_t *cr,
+ caller_context_t *ct,
+ int *deflags, /* Returned per-dirent flags */
+ pathname_t *ppnp) /* Returned case-preserved name in directory */
+{
+ int ret;
+
+ /*
+ * If this file system doesn't support case-insensitive access
+ * and said access is requested, fail quickly. It is required
+ * that if the vfs supports case-insensitive lookup, it also
+ * supports extended dirent flags.
+ */
+ if (flags & FIGNORECASE &&
+ (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(dvp, cr);
+
+ if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
+ ret = xattr_dir_lookup(dvp, vpp, flags, cr);
+ } else {
+ ret = (*(dvp)->v_op->vop_lookup)
+ (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
+ }
+ if (ret == 0 && *vpp) {
+ VOPSTATS_UPDATE(*vpp, lookup);
+ if ((*vpp)->v_path == NULL) {
+ vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
+ }
+ }
+
+ return (ret);
+}
+
+int
+fop_create(
+ vnode_t *dvp,
+ char *name,
+ vattr_t *vap,
+ vcexcl_t excl,
+ int mode,
+ vnode_t **vpp,
+ cred_t *cr,
+ int flags,
+ caller_context_t *ct,
+ vsecattr_t *vsecp) /* ACL to set during create */
+{
+ int ret;
+
+ if (vsecp != NULL &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
+ return (EINVAL);
+ }
+ /*
+ * If this file system doesn't support case-insensitive access
+ * and said access is requested, fail quickly.
+ */
+ if (flags & FIGNORECASE &&
+ (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(dvp, cr);
+
+ ret = (*(dvp)->v_op->vop_create)
+ (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
+ if (ret == 0 && *vpp) {
+ VOPSTATS_UPDATE(*vpp, create);
+ if ((*vpp)->v_path == NULL) {
+ vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
+ }
+ }
+
+ return (ret);
+}
+
+int
+fop_remove(
+ vnode_t *dvp,
+ char *nm,
+ cred_t *cr,
+ caller_context_t *ct,
+ int flags)
+{
+ int err;
+
+ /*
+ * If this file system doesn't support case-insensitive access
+ * and said access is requested, fail quickly.
+ */
+ if (flags & FIGNORECASE &&
+ (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(dvp, cr);
+
+ err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
+ VOPSTATS_UPDATE(dvp, remove);
+ return (err);
+}
+
+int
+fop_link(
+ vnode_t *tdvp,
+ vnode_t *svp,
+ char *tnm,
+ cred_t *cr,
+ caller_context_t *ct,
+ int flags)
+{
+ int err;
+
+ /*
+ * If the target file system doesn't support case-insensitive access
+ * and said access is requested, fail quickly.
+ */
+ if (flags & FIGNORECASE &&
+ (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(tdvp, cr);
+
+ err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
+ VOPSTATS_UPDATE(tdvp, link);
+ return (err);
+}
+
+int
+fop_rename(
+ vnode_t *sdvp,
+ char *snm,
+ vnode_t *tdvp,
+ char *tnm,
+ cred_t *cr,
+ caller_context_t *ct,
+ int flags)
+{
+ int err;
+
+ /*
+ * If the file system involved does not support
+ * case-insensitive access and said access is requested, fail
+ * quickly.
+ */
+ if (flags & FIGNORECASE &&
+ ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(tdvp, cr);
+
+ err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
+ VOPSTATS_UPDATE(sdvp, rename);
+ return (err);
+}
+
+int
+fop_mkdir(
+ vnode_t *dvp,
+ char *dirname,
+ vattr_t *vap,
+ vnode_t **vpp,
+ cred_t *cr,
+ caller_context_t *ct,
+ int flags,
+ vsecattr_t *vsecp) /* ACL to set during create */
+{
+ int ret;
+
+ if (vsecp != NULL &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
+ return (EINVAL);
+ }
+ /*
+ * If this file system doesn't support case-insensitive access
+ * and said access is requested, fail quickly.
+ */
+ if (flags & FIGNORECASE &&
+ (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(dvp, cr);
+
+ ret = (*(dvp)->v_op->vop_mkdir)
+ (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
+ if (ret == 0 && *vpp) {
+ VOPSTATS_UPDATE(*vpp, mkdir);
+ if ((*vpp)->v_path == NULL) {
+ vn_setpath(rootdir, dvp, *vpp, dirname,
+ strlen(dirname));
+ }
+ }
+
+ return (ret);
+}
+
+int
+fop_rmdir(
+ vnode_t *dvp,
+ char *nm,
+ vnode_t *cdir,
+ cred_t *cr,
+ caller_context_t *ct,
+ int flags)
+{
+ int err;
+
+ /*
+ * If this file system doesn't support case-insensitive access
+ * and said access is requested, fail quickly.
+ */
+ if (flags & FIGNORECASE &&
+ (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(dvp, cr);
+
+ err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
+ VOPSTATS_UPDATE(dvp, rmdir);
+ return (err);
+}
+
+int
+fop_readdir(
+ vnode_t *vp,
+ uio_t *uiop,
+ cred_t *cr,
+ int *eofp,
+ caller_context_t *ct,
+ int flags)
+{
+ int err;
+ ssize_t resid_start = uiop->uio_resid;
+
+ /*
+ * If this file system doesn't support retrieving directory
+ * entry flags and said access is requested, fail quickly.
+ */
+ if (flags & V_RDDIR_ENTFLAGS &&
+ vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
+ return (EINVAL);
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
+ VOPSTATS_UPDATE_IO(vp, readdir,
+ readdir_bytes, (resid_start - uiop->uio_resid));
+ return (err);
+}
+
+int
+fop_symlink(
+ vnode_t *dvp,
+ char *linkname,
+ vattr_t *vap,
+ char *target,
+ cred_t *cr,
+ caller_context_t *ct,
+ int flags)
+{
+ int err;
+ xvattr_t xvattr;
+
+ /*
+ * If this file system doesn't support case-insensitive access
+ * and said access is requested, fail quickly.
+ */
+ if (flags & FIGNORECASE &&
+ (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
+ vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
+ return (EINVAL);
+
+ VOPXID_MAP_CR(dvp, cr);
+
+ /* check for reparse point */
+ if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
+ (strncmp(target, FS_REPARSE_TAG_STR,
+ strlen(FS_REPARSE_TAG_STR)) == 0)) {
+ if (!fs_reparse_mark(target, vap, &xvattr))
+ vap = (vattr_t *)&xvattr;
+ }
+
+ err = (*(dvp)->v_op->vop_symlink)
+ (dvp, linkname, vap, target, cr, ct, flags);
+ VOPSTATS_UPDATE(dvp, symlink);
+ return (err);
+}
+
+int
+fop_readlink(
+ vnode_t *vp,
+ uio_t *uiop,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
+ VOPSTATS_UPDATE(vp, readlink);
+ return (err);
+}
+
+int
+fop_fsync(
+ vnode_t *vp,
+ int syncflag,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
+ VOPSTATS_UPDATE(vp, fsync);
+ return (err);
+}
+
+void
+fop_inactive(
+ vnode_t *vp,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ /* Need to update stats before vop call since we may lose the vnode */
+ VOPSTATS_UPDATE(vp, inactive);
+
+ VOPXID_MAP_CR(vp, cr);
+
+ (*(vp)->v_op->vop_inactive)(vp, cr, ct);
+}
+
+int
+fop_fid(
+ vnode_t *vp,
+ fid_t *fidp,
+ caller_context_t *ct)
+{
+ int err;
+
+ err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
+ VOPSTATS_UPDATE(vp, fid);
+ return (err);
+}
+
+int
+fop_rwlock(
+ vnode_t *vp,
+ int write_lock,
+ caller_context_t *ct)
+{
+ int ret;
+
+ ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
+ VOPSTATS_UPDATE(vp, rwlock);
+ return (ret);
+}
+
+void
+fop_rwunlock(
+ vnode_t *vp,
+ int write_lock,
+ caller_context_t *ct)
+{
+ (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
+ VOPSTATS_UPDATE(vp, rwunlock);
+}
+
+int
+fop_seek(
+ vnode_t *vp,
+ offset_t ooff,
+ offset_t *noffp,
+ caller_context_t *ct)
+{
+ int err;
+
+ err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
+ VOPSTATS_UPDATE(vp, seek);
+ return (err);
+}
+
+int
+fop_cmp(
+ vnode_t *vp1,
+ vnode_t *vp2,
+ caller_context_t *ct)
+{
+ int err;
+
+ err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
+ VOPSTATS_UPDATE(vp1, cmp);
+ return (err);
+}
+
+int
+fop_frlock(
+ vnode_t *vp,
+ int cmd,
+ flock64_t *bfp,
+ int flag,
+ offset_t offset,
+ struct flk_callback *flk_cbp,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_frlock)
+ (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
+ VOPSTATS_UPDATE(vp, frlock);
+ return (err);
+}
+
+int
+fop_space(
+ vnode_t *vp,
+ int cmd,
+ flock64_t *bfp,
+ int flag,
+ offset_t offset,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
+ VOPSTATS_UPDATE(vp, space);
+ return (err);
+}
+
+int
+fop_realvp(
+ vnode_t *vp,
+ vnode_t **vpp,
+ caller_context_t *ct)
+{
+ int err;
+
+ err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
+ VOPSTATS_UPDATE(vp, realvp);
+ return (err);
+}
+
+int
+fop_getpage(
+ vnode_t *vp,
+ offset_t off,
+ size_t len,
+ uint_t *protp,
+ page_t **plarr,
+ size_t plsz,
+ struct seg *seg,
+ caddr_t addr,
+ enum seg_rw rw,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_getpage)
+ (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
+ VOPSTATS_UPDATE(vp, getpage);
+ return (err);
+}
+
+int
+fop_putpage(
+ vnode_t *vp,
+ offset_t off,
+ size_t len,
+ int flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
+ VOPSTATS_UPDATE(vp, putpage);
+ return (err);
+}
+
+int
+fop_map(
+ vnode_t *vp,
+ offset_t off,
+ struct as *as,
+ caddr_t *addrp,
+ size_t len,
+ uchar_t prot,
+ uchar_t maxprot,
+ uint_t flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_map)
+ (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
+ VOPSTATS_UPDATE(vp, map);
+ return (err);
+}
+
+int
+fop_addmap(
+ vnode_t *vp,
+ offset_t off,
+ struct as *as,
+ caddr_t addr,
+ size_t len,
+ uchar_t prot,
+ uchar_t maxprot,
+ uint_t flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int error;
+ u_longlong_t delta;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ error = (*(vp)->v_op->vop_addmap)
+ (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
+
+ if ((!error) && (vp->v_type == VREG)) {
+ delta = (u_longlong_t)btopr(len);
+ /*
+ * If file is declared MAP_PRIVATE, it can't be written back
+ * even if open for write. Handle as read.
+ */
+ if (flags & MAP_PRIVATE) {
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
+ (int64_t)delta);
+ } else {
+ /*
+ * atomic_add_64 forces the fetch of a 64 bit value to
+ * be atomic on 32 bit machines
+ */
+ if (maxprot & PROT_WRITE)
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
+ (int64_t)delta);
+ if (maxprot & PROT_READ)
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
+ (int64_t)delta);
+ if (maxprot & PROT_EXEC)
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
+ (int64_t)delta);
+ }
+ }
+ VOPSTATS_UPDATE(vp, addmap);
+ return (error);
+}
+
+int
+fop_delmap(
+ vnode_t *vp,
+ offset_t off,
+ struct as *as,
+ caddr_t addr,
+ size_t len,
+ uint_t prot,
+ uint_t maxprot,
+ uint_t flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int error;
+ u_longlong_t delta;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ error = (*(vp)->v_op->vop_delmap)
+ (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
+
+ /*
+ * NFS calls into delmap twice: the first time
+ * it simply establishes a callback mechanism and returns EAGAIN,
+ * while the real work is done upon the second invocation.
+ * We have to detect this here and only decrement the counts upon
+ * the second delmap request.
+ */
+ if ((error != EAGAIN) && (vp->v_type == VREG)) {
+
+ delta = (u_longlong_t)btopr(len);
+
+ if (flags & MAP_PRIVATE) {
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
+ (int64_t)(-delta));
+ } else {
+ /*
+ * atomic_add_64 forces the fetch of a 64 bit value
+ * to be atomic on 32 bit machines
+ */
+ if (maxprot & PROT_WRITE)
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
+ (int64_t)(-delta));
+ if (maxprot & PROT_READ)
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
+ (int64_t)(-delta));
+ if (maxprot & PROT_EXEC)
+ atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
+ (int64_t)(-delta));
+ }
+ }
+ VOPSTATS_UPDATE(vp, delmap);
+ return (error);
+}
+
+
+int
+fop_poll(
+ vnode_t *vp,
+ short events,
+ int anyyet,
+ short *reventsp,
+ struct pollhead **phpp,
+ caller_context_t *ct)
+{
+ int err;
+
+ err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
+ VOPSTATS_UPDATE(vp, poll);
+ return (err);
+}
+
+int
+fop_dump(
+ vnode_t *vp,
+ caddr_t addr,
+ offset_t lbdn,
+ offset_t dblks,
+ caller_context_t *ct)
+{
+ int err;
+
+ /* ensure lbdn and dblks can be passed safely to bdev_dump */
+ if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
+ return (EIO);
+
+ err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
+ VOPSTATS_UPDATE(vp, dump);
+ return (err);
+}
+
+int
+fop_pathconf(
+ vnode_t *vp,
+ int cmd,
+ ulong_t *valp,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
+ VOPSTATS_UPDATE(vp, pathconf);
+ return (err);
+}
+
+int
+fop_pageio(
+ vnode_t *vp,
+ struct page *pp,
+ u_offset_t io_off,
+ size_t io_len,
+ int flags,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
+ VOPSTATS_UPDATE(vp, pageio);
+ return (err);
+}
+
+int
+fop_dumpctl(
+ vnode_t *vp,
+ int action,
+ offset_t *blkp,
+ caller_context_t *ct)
+{
+ int err;
+ err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
+ VOPSTATS_UPDATE(vp, dumpctl);
+ return (err);
+}
+
+void
+fop_dispose(
+ vnode_t *vp,
+ page_t *pp,
+ int flag,
+ int dn,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ /* Must do stats first since it's possible to lose the vnode */
+ VOPSTATS_UPDATE(vp, dispose);
+
+ VOPXID_MAP_CR(vp, cr);
+
+ (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
+}
+
+int
+fop_setsecattr(
+ vnode_t *vp,
+ vsecattr_t *vsap,
+ int flag,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ /*
+ * We're only allowed to skip the ACL check iff we used a 32 bit
+ * ACE mask with VOP_ACCESS() to determine permissions.
+ */
+ if ((flag & ATTR_NOACLCHECK) &&
+ vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
+ return (EINVAL);
+ }
+ err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
+ VOPSTATS_UPDATE(vp, setsecattr);
+ return (err);
+}
+
+int
+fop_getsecattr(
+ vnode_t *vp,
+ vsecattr_t *vsap,
+ int flag,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ /*
+ * We're only allowed to skip the ACL check iff we used a 32 bit
+ * ACE mask with VOP_ACCESS() to determine permissions.
+ */
+ if ((flag & ATTR_NOACLCHECK) &&
+ vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
+ return (EINVAL);
+ }
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
+ VOPSTATS_UPDATE(vp, getsecattr);
+ return (err);
+}
+
+int
+fop_shrlock(
+ vnode_t *vp,
+ int cmd,
+ struct shrlock *shr,
+ int flag,
+ cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ VOPXID_MAP_CR(vp, cr);
+
+ err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
+ VOPSTATS_UPDATE(vp, shrlock);
+ return (err);
+}
+
+int
+fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
+ caller_context_t *ct)
+{
+ int err;
+
+ err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
+ VOPSTATS_UPDATE(vp, vnevent);
+ return (err);
+}
+
+int
+fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+ return (ENOTSUP);
+ err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
+ VOPSTATS_UPDATE(vp, reqzcbuf);
+ return (err);
+}
+
+int
+fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ int err;
+
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+ return (ENOTSUP);
+ err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
+ VOPSTATS_UPDATE(vp, retzcbuf);
+ return (err);
+}
+
+/*
+ * Default destructor.
+ * Needed because a NULL destructor means that the key is unused.
+ */
+/* ARGSUSED */
+void
+vsd_defaultdestructor(void *value)
+{}
+
+/*
+ * Create a key (index into per vnode array)
+ * Locks out vsd_create, vsd_destroy, and vsd_free
+ * May allocate memory with lock held
+ */
+void
+vsd_create(uint_t *keyp, void (*destructor)(void *))
+{
+ int i;
+ uint_t nkeys;
+
+ /*
+ * if key is allocated, do nothing
+ */
+ mutex_enter(&vsd_lock);
+ if (*keyp) {
+ mutex_exit(&vsd_lock);
+ return;
+ }
+ /*
+ * find an unused key
+ */
+ if (destructor == NULL)
+ destructor = vsd_defaultdestructor;
+
+ for (i = 0; i < vsd_nkeys; ++i)
+ if (vsd_destructor[i] == NULL)
+ break;
+
+ /*
+ * if no unused keys, increase the size of the destructor array
+ */
+ if (i == vsd_nkeys) {
+ if ((nkeys = (vsd_nkeys << 1)) == 0)
+ nkeys = 1;
+ vsd_destructor =
+ (void (**)(void *))vsd_realloc((void *)vsd_destructor,
+ (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
+ (size_t)(nkeys * sizeof (void (*)(void *))));
+ vsd_nkeys = nkeys;
+ }
+
+ /*
+ * allocate the next available unused key
+ */
+ vsd_destructor[i] = destructor;
+ *keyp = i + 1;
+
+ /* create vsd_list, if it doesn't exist */
+ if (vsd_list == NULL) {
+ vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(vsd_list, sizeof (struct vsd_node),
+ offsetof(struct vsd_node, vs_nodes));
+ }
+
+ mutex_exit(&vsd_lock);
+}
+
+/*
+ * Destroy a key
+ *
+ * Assumes that the caller is preventing vsd_set and vsd_get
+ * Locks out vsd_create, vsd_destroy, and vsd_free
+ * May free memory with lock held
+ */
+void
+vsd_destroy(uint_t *keyp)
+{
+ uint_t key;
+ struct vsd_node *vsd;
+
+ /*
+ * protect the key namespace and our destructor lists
+ */
+ mutex_enter(&vsd_lock);
+ key = *keyp;
+ *keyp = 0;
+
+ ASSERT(key <= vsd_nkeys);
+
+ /*
+ * if the key is valid
+ */
+ if (key != 0) {
+ uint_t k = key - 1;
+ /*
+ * for every vnode with VSD, call key's destructor
+ */
+ for (vsd = list_head(vsd_list); vsd != NULL;
+ vsd = list_next(vsd_list, vsd)) {
+ /*
+ * no VSD for key in this vnode
+ */
+ if (key > vsd->vs_nkeys)
+ continue;
+ /*
+ * call destructor for key
+ */
+ if (vsd->vs_value[k] && vsd_destructor[k])
+ (*vsd_destructor[k])(vsd->vs_value[k]);
+ /*
+ * reset value for key
+ */
+ vsd->vs_value[k] = NULL;
+ }
+ /*
+ * actually free the key (NULL destructor == unused)
+ */
+ vsd_destructor[k] = NULL;
+ }
+
+ mutex_exit(&vsd_lock);
+}
+
+/*
+ * Quickly return the per vnode value that was stored with the specified key
+ * Assumes the caller is protecting key from vsd_create and vsd_destroy
+ * Assumes the caller is holding v_vsd_lock to protect the vsd.
+ */
+void *
+vsd_get(vnode_t *vp, uint_t key)
+{
+ struct vsd_node *vsd;
+
+ ASSERT(vp != NULL);
+ ASSERT(mutex_owned(&vp->v_vsd_lock));
+
+ vsd = vp->v_vsd;
+
+ if (key && vsd != NULL && key <= vsd->vs_nkeys)
+ return (vsd->vs_value[key - 1]);
+ return (NULL);
+}
+
+/*
+ * Set a per vnode value indexed with the specified key
+ * Assumes the caller is holding v_vsd_lock to protect the vsd.
+ */
+int
+vsd_set(vnode_t *vp, uint_t key, void *value)
+{
+ struct vsd_node *vsd;
+
+ ASSERT(vp != NULL);
+ ASSERT(mutex_owned(&vp->v_vsd_lock));
+
+ if (key == 0)
+ return (EINVAL);
+
+ vsd = vp->v_vsd;
+ if (vsd == NULL)
+ vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
+
+ /*
+ * If the vsd was just allocated, vs_nkeys will be 0, so the following
+ * code won't happen and we will continue down and allocate space for
+ * the vs_value array.
+ * If the caller is replacing one value with another, then it is up
+ * to the caller to free/rele/destroy the previous value (if needed).
+ */
+ if (key <= vsd->vs_nkeys) {
+ vsd->vs_value[key - 1] = value;
+ return (0);
+ }
+
+ ASSERT(key <= vsd_nkeys);
+
+ if (vsd->vs_nkeys == 0) {
+ mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
+ /*
+ * Link onto list of all VSD nodes.
+ */
+ list_insert_head(vsd_list, vsd);
+ mutex_exit(&vsd_lock);
+ }
+
+ /*
+ * Allocate vnode local storage and set the value for key
+ */
+ vsd->vs_value = vsd_realloc(vsd->vs_value,
+ vsd->vs_nkeys * sizeof (void *),
+ key * sizeof (void *));
+ vsd->vs_nkeys = key;
+ vsd->vs_value[key - 1] = value;
+
+ return (0);
+}
+
+/*
+ * Called from vn_free() to run the destructor function for each vsd
+ * Locks out vsd_create and vsd_destroy
+ * Assumes that the destructor *DOES NOT* use vsd
+ */
+void
+vsd_free(vnode_t *vp)
+{
+ int i;
+ struct vsd_node *vsd = vp->v_vsd;
+
+ if (vsd == NULL)
+ return;
+
+ if (vsd->vs_nkeys == 0) {
+ kmem_free(vsd, sizeof (*vsd));
+ vp->v_vsd = NULL;
+ return;
+ }
+
+ /*
+ * lock out vsd_create and vsd_destroy, call
+ * the destructor, and mark the value as destroyed.
+ */
+ mutex_enter(&vsd_lock);
+
+ for (i = 0; i < vsd->vs_nkeys; i++) {
+ if (vsd->vs_value[i] && vsd_destructor[i])
+ (*vsd_destructor[i])(vsd->vs_value[i]);
+ vsd->vs_value[i] = NULL;
+ }
+
+ /*
+ * remove from linked list of VSD nodes
+ */
+ list_remove(vsd_list, vsd);
+
+ mutex_exit(&vsd_lock);
+
+ /*
+ * free up the VSD
+ */
+ kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
+ kmem_free(vsd, sizeof (struct vsd_node));
+ vp->v_vsd = NULL;
+}
+
+/*
+ * realloc
+ */
+static void *
+vsd_realloc(void *old, size_t osize, size_t nsize)
+{
+ void *new;
+
+ new = kmem_zalloc(nsize, KM_SLEEP);
+ if (old) {
+ bcopy(old, new, osize);
+ kmem_free(old, osize);
+ }
+ return (new);
+}
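+
+/*
+ * Illustrative userland sketch, not part of the imported sources: the same
+ * "allocate larger, zero-fill, copy, free the old block" growth pattern
+ * that vsd_realloc() implements with kmem_zalloc()/kmem_free(), expressed
+ * with calloc()/free().  Unlike the KM_SLEEP allocation above, calloc()
+ * can fail, so this sketch returns NULL and leaves the old block intact.
+ */
+#include <stdlib.h>
+#include <string.h>
+
+static void *
+grow_zeroed(void *old, size_t osize, size_t nsize)
+{
+        void *new = calloc(1, nsize);   /* new space starts zero-filled */
+
+        if (new == NULL)
+                return (NULL);          /* caller keeps the old block */
+        if (old != NULL) {
+                memcpy(new, old, osize);
+                free(old);
+        }
+        return (new);
+}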
+
+/*
+ * Setup the extensible system attribute for creating a reparse point.
+ * The symlink data 'target' is validated for proper format of a reparse
+ * string, and a check is also made to ensure the symlink data does not
+ * point to an existing file.
+ *
+ * Return 0 if ok, else -1.
+ */
+static int
+fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
+{
+ xoptattr_t *xoap;
+
+ if ((!target) || (!vap) || (!xvattr))
+ return (-1);
+
+ /* validate reparse string */
+ if (reparse_validate((const char *)target))
+ return (-1);
+
+ xva_init(xvattr);
+ xvattr->xva_vattr = *vap;
+ xvattr->xva_vattr.va_mask |= AT_XVATTR;
+ xoap = xva_getxoptattr(xvattr);
+ ASSERT(xoap);
+ XVA_SET_REQ(xvattr, XAT_REPARSE);
+ xoap->xoa_reparse = 1;
+
+ return (0);
+}
+
+/*
+ * Function to check whether a symlink is a reparse point.
+ * Return B_TRUE if it is a reparse point, else return B_FALSE
+ */
+boolean_t
+vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ xvattr_t xvattr;
+ xoptattr_t *xoap;
+
+ if ((vp->v_type != VLNK) ||
+ !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
+ return (B_FALSE);
+
+ xva_init(&xvattr);
+ xoap = xva_getxoptattr(&xvattr);
+ ASSERT(xoap);
+ XVA_SET_REQ(&xvattr, XAT_REPARSE);
+
+ if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
+ return (B_FALSE);
+
+ if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
+ (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
+ return (B_FALSE);
+
+ return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
+}
diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c
new file mode 100644
index 000000000000..a82718e8bc6e
--- /dev/null
+++ b/uts/common/fs/zfs/arc.c
@@ -0,0 +1,4658 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal arc algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * arc list locks.
+ *
+ * Buffers do not have their own mutexes; rather, they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each arc state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an arc list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()). Note however that the data associated
+ * with the buffer may be evicted prior to the callback. The callback
+ * must be made with *no locks held* (to prevent deadlock). Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ *
+ * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
+ */
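+
+/*
+ * Illustrative userland sketch, not part of the imported sources: the
+ * try-lock discipline described above, with POSIX threads standing in for
+ * kmutex_t.  A thread that already holds a list lock must not block on a
+ * hash lock (another thread may hold the hash lock and be waiting for the
+ * list lock), so it uses a try-lock and backs off on failure.
+ */
+#include <pthread.h>
+
+static int
+lock_hash_while_holding_list(pthread_mutex_t *list_lock,
+    pthread_mutex_t *hash_lock)
+{
+        /* list_lock is assumed to be held on entry */
+        if (pthread_mutex_trylock(hash_lock) != 0) {
+                /*
+                 * Could not get the hash lock without blocking; give up
+                 * the list lock so the other thread can make progress,
+                 * and let the caller retry or skip this buffer.
+                 */
+                pthread_mutex_unlock(list_lock);
+                return (-1);
+        }
+        return (0);             /* both locks held */
+}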
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <vm/anon.h>
+#include <sys/fs/swapnode.h>
+#include <sys/dnlc.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <zfs_fletcher.h>
+
+static kmutex_t arc_reclaim_thr_lock;
+static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
+static uint8_t arc_thread_exit;
+
+extern int zfs_write_limit_shift;
+extern uint64_t zfs_write_limit_max;
+extern kmutex_t zfs_write_limit_lock;
+
+#define ARC_REDUCE_DNLC_PERCENT 3
+uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+
+typedef enum arc_reclaim_strategy {
+ ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
+ ARC_RECLAIM_CONS /* Conservative reclaim strategy */
+} arc_reclaim_strategy_t;
+
+/* number of seconds before growing cache again */
+static int arc_grow_retry = 60;
+
+/* shift of arc_c for calculating both min and max arc_p */
+static int arc_p_min_shift = 4;
+
+/* log2(fraction of arc to reclaim) */
+static int arc_shrink_shift = 5;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int arc_min_prefetch_lifespan;
+
+static int arc_dead;
+
+/*
+ * The arc has filled available memory and has now warmed up.
+ */
+static boolean_t arc_warm;
+
+/*
+ * These tunables are for performance analysis.
+ */
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+uint64_t zfs_arc_meta_limit = 0;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_arc_p_min_shift = 0;
+
+/*
+ * Note that buffers can be in one of 6 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to the buffer, they are
+ * linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
+ */
+
+typedef struct arc_state {
+ list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
+ uint64_t arcs_size; /* total amount of data in this state */
+ kmutex_t arcs_mtx;
+} arc_state_t;
+
+/* The 6 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
+static arc_state_t ARC_l2c_only;
+
+typedef struct arc_stats {
+ kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_misses;
+ kstat_named_t arcstat_demand_data_hits;
+ kstat_named_t arcstat_demand_data_misses;
+ kstat_named_t arcstat_demand_metadata_hits;
+ kstat_named_t arcstat_demand_metadata_misses;
+ kstat_named_t arcstat_prefetch_data_hits;
+ kstat_named_t arcstat_prefetch_data_misses;
+ kstat_named_t arcstat_prefetch_metadata_hits;
+ kstat_named_t arcstat_prefetch_metadata_misses;
+ kstat_named_t arcstat_mru_hits;
+ kstat_named_t arcstat_mru_ghost_hits;
+ kstat_named_t arcstat_mfu_hits;
+ kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_deleted;
+ kstat_named_t arcstat_recycle_miss;
+ kstat_named_t arcstat_mutex_miss;
+ kstat_named_t arcstat_evict_skip;
+ kstat_named_t arcstat_evict_l2_cached;
+ kstat_named_t arcstat_evict_l2_eligible;
+ kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_hash_elements;
+ kstat_named_t arcstat_hash_elements_max;
+ kstat_named_t arcstat_hash_collisions;
+ kstat_named_t arcstat_hash_chains;
+ kstat_named_t arcstat_hash_chain_max;
+ kstat_named_t arcstat_p;
+ kstat_named_t arcstat_c;
+ kstat_named_t arcstat_c_min;
+ kstat_named_t arcstat_c_max;
+ kstat_named_t arcstat_size;
+ kstat_named_t arcstat_hdr_size;
+ kstat_named_t arcstat_data_size;
+ kstat_named_t arcstat_other_size;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_read_bytes;
+ kstat_named_t arcstat_l2_write_bytes;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_memory_throttle_count;
+} arc_stats_t;
+
+static arc_stats_t arc_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_misses", KSTAT_DATA_UINT64 },
+ { "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
+ { "mru_hits", KSTAT_DATA_UINT64 },
+ { "mru_ghost_hits", KSTAT_DATA_UINT64 },
+ { "mfu_hits", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "deleted", KSTAT_DATA_UINT64 },
+ { "recycle_miss", KSTAT_DATA_UINT64 },
+ { "mutex_miss", KSTAT_DATA_UINT64 },
+ { "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_l2_cached", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "p", KSTAT_DATA_UINT64 },
+ { "c", KSTAT_DATA_UINT64 },
+ { "c_min", KSTAT_DATA_UINT64 },
+ { "c_max", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "data_size", KSTAT_DATA_UINT64 },
+ { "other_size", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_read_bytes", KSTAT_DATA_UINT64 },
+ { "l2_write_bytes", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 }
+};
+
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+
+#define ARCSTAT_INCR(stat, val) \
+ atomic_add_64(&arc_stats.stat.value.ui64, (val));
+
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
+
+#define ARCSTAT_MAX(stat, val) { \
+ uint64_t m; \
+ while ((val) > (m = arc_stats.stat.value.ui64) && \
+ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
+ continue; \
+}
+
+#define ARCSTAT_MAXSTAT(stat) \
+ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
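+
+/*
+ * Illustrative userland sketch, not part of the imported sources: the
+ * lock-free running-maximum update performed by ARCSTAT_MAX above, written
+ * with C11 atomics in place of atomic_cas_64().  The loop retries until
+ * either the stored value is already >= val or our compare-and-swap wins.
+ */
+#include <stdatomic.h>
+#include <stdint.h>
+
+static void
+stat_max(_Atomic uint64_t *stat, uint64_t val)
+{
+        uint64_t m = atomic_load(stat);
+
+        /* on failure, atomic_compare_exchange_weak reloads m for us */
+        while (val > m && !atomic_compare_exchange_weak(stat, &m, val))
+                continue;
+}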
+
+/*
+ * We define a macro to allow ARC hits/misses to be easily broken down by
+ * two separate conditions, giving a total of four different subtypes for
+ * each of hits and misses (so eight statistics total).
+ */
+#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
+ if (cond1) { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
+ } \
+ } else { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
+ } \
+ }
+
+kstat_t *arc_ksp;
+static arc_state_t *arc_anon;
+static arc_state_t *arc_mru;
+static arc_state_t *arc_mru_ghost;
+static arc_state_t *arc_mfu;
+static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+
+static int arc_no_grow; /* Don't try to grow cache size */
+static uint64_t arc_tempreserve;
+static uint64_t arc_loaned_bytes;
+static uint64_t arc_meta_used;
+static uint64_t arc_meta_limit;
+static uint64_t arc_meta_max = 0;
+
+typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ void *acb_private;
+ arc_done_func_t *acb_done;
+ arc_buf_t *acb_buf;
+ zio_t *acb_zio_dummy;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ uint64_t b_cksum0;
+
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+ void *b_thawed;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_buf_t *b_buf;
+ uint32_t b_flags;
+ uint32_t b_datacnt;
+
+ arc_callback_t *b_acb;
+ kcondvar_t b_cv;
+
+ /* immutable */
+ arc_buf_contents_t b_type;
+ uint64_t b_size;
+ uint64_t b_spa;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ list_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ refcount_t b_refcnt;
+
+ l2arc_buf_hdr_t *b_l2hdr;
+ list_node_t b_l2node;
+};
+
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static arc_buf_hdr_t arc_eviction_hdr;
+static void arc_get_data_buf(arc_buf_t *buf);
+static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+static int arc_evict_needed(arc_buf_contents_t type);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
+
+/*
+ * Private ARC flags. These flags are private ARC only flags that will show up
+ * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read. However, these flags
+ * should never be passed and should only be set by ARC code. When adding new
+ * public flags, make sure not to smash the private ones.
+ */
+
+#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
+#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
+#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
+#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
+#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
+#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
+#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
+#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
+
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
+#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
+#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
+#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
+ (hdr)->b_l2hdr != NULL)
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+
+/*
+ * Other sizes
+ */
+
+#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_PAD 64
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+#define BUF_LOCKS 256
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
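+
+/*
+ * Illustrative userland sketch, not part of the imported sources: the lock
+ * striping behind BUF_HASH_LOCK_NTRY above.  Many hash buckets share a
+ * small, power-of-two array of locks, chosen by masking the bucket index;
+ * pthread mutexes stand in for kmutex_t and the lock count is arbitrary.
+ */
+#include <pthread.h>
+#include <stdint.h>
+
+#define EXAMPLE_NLOCKS  256     /* must be a power of two, like BUF_LOCKS */
+
+static pthread_mutex_t example_locks[EXAMPLE_NLOCKS];
+
+static void
+example_locks_init(void)
+{
+        for (int i = 0; i < EXAMPLE_NLOCKS; i++)
+                (void) pthread_mutex_init(&example_locks[i], NULL);
+}
+
+static pthread_mutex_t *
+example_lock_for(uint64_t bucket_idx)
+{
+        /* masking works only because EXAMPLE_NLOCKS is a power of two */
+        return (&example_locks[bucket_idx & (EXAMPLE_NLOCKS - 1)]);
+}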
+
+uint64_t zfs_crc64_table[256];
+
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 2 /* num of writes */
+#define L2ARC_FEED_SECS 1 /* caching interval secs */
+#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/*
+ * L2ARC Performance Tunables
+ */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
+boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_write; /* desired write size, bytes */
+ uint64_t l2ad_boost; /* warmup write boost, bytes */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+} l2arc_dev_t;
+
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+} l2arc_write_callback_t;
+
+struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+};
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ void *l2df_data;
+ size_t l2df_size;
+ void (*l2df_func)(void *, size_t);
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static void l2arc_read_done(zio_t *zio);
+static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_remove(void);
+
+static uint64_t
+buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
+{
+ uint8_t *vdva = (uint8_t *)dva;
+ uint64_t crc = -1ULL;
+ int i;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ for (i = 0; i < sizeof (dva_t); i++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
+
+ crc ^= (spa>>8) ^ birth;
+
+ return (crc);
+}
+
+#define BUF_EMPTY(buf) \
+ ((buf)->b_dva.dva_word[0] == 0 && \
+ (buf)->b_dva.dva_word[1] == 0 && \
+ (buf)->b_birth == 0)
+
+#define BUF_EQUAL(spa, dva, birth, buf) \
+ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+}
+
+static arc_buf_hdr_t *
+buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *buf;
+
+ mutex_enter(hash_lock);
+ for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+ buf = buf->b_hash_next) {
+ if (BUF_EQUAL(spa, dva, birth, buf)) {
+ *lockp = hash_lock;
+ return (buf);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fbuf;
+ uint32_t i;
+
+ ASSERT(!HDR_IN_HASH_TABLE(buf));
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+ fbuf = fbuf->b_hash_next, i++) {
+ if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
+ return (fbuf);
+ }
+
+ buf->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = buf;
+ buf->b_flags |= ARC_IN_HASH_TABLE;
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ ARCSTAT_BUMP(arcstat_hash_collisions);
+ if (i == 1)
+ ARCSTAT_BUMP(arcstat_hash_chains);
+
+ ARCSTAT_MAX(arcstat_hash_chain_max, i);
+ }
+
+ ARCSTAT_BUMP(arcstat_hash_elements);
+ ARCSTAT_MAXSTAT(arcstat_hash_elements);
+
+ return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *buf)
+{
+ arc_buf_hdr_t *fbuf, **bufp;
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(buf));
+
+ bufp = &buf_hash_table.ht_table[idx];
+ while ((fbuf = *bufp) != buf) {
+ ASSERT(fbuf != NULL);
+ bufp = &fbuf->b_hash_next;
+ }
+ *bufp = buf->b_hash_next;
+ buf->b_hash_next = NULL;
+ buf->b_flags &= ~ARC_IN_HASH_TABLE;
+
+ /* collect some hash table performance data */
+ ARCSTAT_BUMPDOWN(arcstat_hash_elements);
+
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ ARCSTAT_BUMPDOWN(arcstat_hash_chains);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+static kmem_cache_t *hdr_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_hdr_t));
+ refcount_create(&buf->b_refcnt);
+ cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+buf_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_t));
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
+ arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ ASSERT(BUF_EMPTY(buf));
+ refcount_destroy(&buf->b_refcnt);
+ cv_destroy(&buf->b_cv);
+ mutex_destroy(&buf->b_freeze_lock);
+ arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+buf_dest(void *vbuf, void *unused)
+{
+ arc_buf_t *buf = vbuf;
+
+ mutex_destroy(&buf->b_evict_lock);
+ rw_destroy(&buf->b_data_lock);
+ arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+}
+
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+ dprintf("hdr_recl called\n");
+ /*
+ * umem calls the reclaim func when we destroy the buf cache,
+ * which is after we do arc_fini().
+ */
+ if (!arc_dead)
+ cv_signal(&arc_reclaim_thr_cv);
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct;
+ uint64_t hsize = 1ULL << 12;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 64K block size. The table will take up
+ * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
+ */
+ while (hsize * 65536 < physmem * PAGESIZE)
+ hsize <<= 1;
+retry:
+ buf_hash_table.ht_mask = hsize - 1;
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
+ 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
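+
+/*
+ * Illustrative userland sketch, not part of the imported sources: how the
+ * zfs_crc64_table filled in by buf_init() is used.  The table construction
+ * and the byte-at-a-time slide mirror buf_hash(); the polynomial constant
+ * is assumed to equal ZFS_CRC64_POLY (the reflected ECMA-182 polynomial).
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+#define EXAMPLE_CRC64_POLY      0xC96C5795D7870F42ULL   /* assumed value */
+
+static uint64_t example_crc64_table[256];
+
+static void
+example_crc64_init(void)
+{
+        uint64_t *ct;
+        int i, j;
+
+        for (i = 0; i < 256; i++)
+                for (ct = example_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+                        *ct = (*ct >> 1) ^ (-(*ct & 1) & EXAMPLE_CRC64_POLY);
+}
+
+static uint64_t
+example_crc64(const void *buf, size_t len)
+{
+        const uint8_t *p = buf;
+        uint64_t crc = -1ULL;
+
+        while (len-- > 0)
+                crc = (crc >> 8) ^ example_crc64_table[(crc ^ *p++) & 0xFF];
+        return (crc);
+}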
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+ zio_cksum_t zc;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum == NULL ||
+ (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+ return;
+ }
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+ panic("buffer modified while frozen!");
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+static int
+arc_cksum_equal(arc_buf_t *buf)
+{
+ zio_cksum_t zc;
+ int equal;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+
+ return (equal);
+}
+
+static void
+arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+{
+ if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum != NULL) {
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+ return;
+ }
+ buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
+ buf->b_hdr->b_freeze_cksum);
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_state != arc_anon)
+ panic("modifying non-anon buffer!");
+ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ panic("modifying buffer while i/o in progress!");
+ arc_cksum_verify(buf);
+ }
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum != NULL) {
+ kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ buf->b_hdr->b_freeze_cksum = NULL;
+ }
+
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_thawed)
+ kmem_free(buf->b_hdr->b_thawed, 1);
+ buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ }
+
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+ kmutex_t *hash_lock;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
+ buf->b_hdr->b_state == arc_anon);
+ arc_cksum_compute(buf, B_FALSE);
+ mutex_exit(hash_lock);
+}
+
+static void
+add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
+ (ab->b_state != arc_anon)) {
+ uint64_t delta = ab->b_size * ab->b_datacnt;
+ list_t *list = &ab->b_state->arcs_list[ab->b_type];
+ uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
+
+ ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
+ mutex_enter(&ab->b_state->arcs_mtx);
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(list, ab);
+ if (GHOST_STATE(ab->b_state)) {
+ ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT3P(ab->b_buf, ==, NULL);
+ delta = ab->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(*size, >=, delta);
+ atomic_add_64(size, -delta);
+ mutex_exit(&ab->b_state->arcs_mtx);
+ /* remove the prefetch flag if we get a reference */
+ if (ab->b_flags & ARC_PREFETCH)
+ ab->b_flags &= ~ARC_PREFETCH;
+ }
+}
+
+static int
+remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+ arc_state_t *state = ab->b_state;
+
+ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(state));
+
+ if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+ (state != arc_anon)) {
+ uint64_t *size = &state->arcs_lsize[ab->b_type];
+
+ ASSERT(!MUTEX_HELD(&state->arcs_mtx));
+ mutex_enter(&state->arcs_mtx);
+ ASSERT(!list_link_active(&ab->b_arc_node));
+ list_insert_head(&state->arcs_list[ab->b_type], ab);
+ ASSERT(ab->b_datacnt > 0);
+ atomic_add_64(size, ab->b_size * ab->b_datacnt);
+ mutex_exit(&state->arcs_mtx);
+ }
+ return (cnt);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The mutex
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+{
+ arc_state_t *old_state = ab->b_state;
+ int64_t refcnt = refcount_count(&ab->b_refcnt);
+ uint64_t from_delta, to_delta;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(new_state != old_state);
+ ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+ ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+ ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
+
+ from_delta = to_delta = ab->b_datacnt * ab->b_size;
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcnt == 0) {
+ if (old_state != arc_anon) {
+ int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+ uint64_t *size = &old_state->arcs_lsize[ab->b_type];
+
+ if (use_mutex)
+ mutex_enter(&old_state->arcs_mtx);
+
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&old_state->arcs_list[ab->b_type], ab);
+
+ /*
+ * If prefetching out of the ghost cache,
+ * we will have a non-zero datacnt.
+ */
+ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
+ /* ghost elements have a ghost size */
+ ASSERT(ab->b_buf == NULL);
+ from_delta = ab->b_size;
+ }
+ ASSERT3U(*size, >=, from_delta);
+ atomic_add_64(size, -from_delta);
+
+ if (use_mutex)
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ if (new_state != arc_anon) {
+ int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+ uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+
+ if (use_mutex)
+ mutex_enter(&new_state->arcs_mtx);
+
+ list_insert_head(&new_state->arcs_list[ab->b_type], ab);
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(new_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ to_delta = ab->b_size;
+ }
+ atomic_add_64(size, to_delta);
+
+ if (use_mutex)
+ mutex_exit(&new_state->arcs_mtx);
+ }
+ }
+
+ ASSERT(!BUF_EMPTY(ab));
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
+ buf_hash_remove(ab);
+
+ /* adjust state sizes */
+ if (to_delta)
+ atomic_add_64(&new_state->arcs_size, to_delta);
+ if (from_delta) {
+ ASSERT3U(old_state->arcs_size, >=, from_delta);
+ atomic_add_64(&old_state->arcs_size, -from_delta);
+ }
+ ab->b_state = new_state;
+
+ /* adjust l2arc hdr stats */
+ if (new_state == arc_l2c_only)
+ l2arc_hdr_stat_add();
+ else if (old_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+}
+
+void
+arc_space_consume(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ case ARC_SPACE_DATA:
+ ARCSTAT_INCR(arcstat_data_size, space);
+ break;
+ case ARC_SPACE_OTHER:
+ ARCSTAT_INCR(arcstat_other_size, space);
+ break;
+ case ARC_SPACE_HDRS:
+ ARCSTAT_INCR(arcstat_hdr_size, space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ ARCSTAT_INCR(arcstat_l2_hdr_size, space);
+ break;
+ }
+
+ atomic_add_64(&arc_meta_used, space);
+ atomic_add_64(&arc_size, space);
+}
+
+void
+arc_space_return(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ case ARC_SPACE_DATA:
+ ARCSTAT_INCR(arcstat_data_size, -space);
+ break;
+ case ARC_SPACE_OTHER:
+ ARCSTAT_INCR(arcstat_other_size, -space);
+ break;
+ case ARC_SPACE_HDRS:
+ ARCSTAT_INCR(arcstat_hdr_size, -space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
+ break;
+ }
+
+ ASSERT(arc_meta_used >= space);
+ if (arc_meta_max < arc_meta_used)
+ arc_meta_max = arc_meta_used;
+ atomic_add_64(&arc_meta_used, -space);
+ ASSERT(arc_size >= space);
+ atomic_add_64(&arc_size, -space);
+}
+
+void *
+arc_data_buf_alloc(uint64_t size)
+{
+ if (arc_evict_needed(ARC_BUFC_DATA))
+ cv_signal(&arc_reclaim_thr_cv);
+ atomic_add_64(&arc_size, size);
+ return (zio_data_buf_alloc(size));
+}
+
+void
+arc_data_buf_free(void *buf, uint64_t size)
+{
+ zio_data_buf_free(buf, size);
+ ASSERT(arc_size >= size);
+ atomic_add_64(&arc_size, -size);
+}
+
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = spa_guid(spa);
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ hdr->b_datacnt = 1;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ (void) refcount_add(&hdr->b_refcnt, tag);
+
+ return (buf);
+}
+
+static char *arc_onloan_tag = "onloan";
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, int size)
+{
+ arc_buf_t *buf;
+
+ buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+
+ atomic_add_64(&arc_loaned_bytes, size);
+ return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(buf->b_data != NULL);
+ (void) refcount_add(&hdr->b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+
+ atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+}
+
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(buf->b_data != NULL);
+ hdr = buf->b_hdr;
+ (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
+static arc_buf_t *
+arc_buf_clone(arc_buf_t *from)
+{
+ arc_buf_t *buf;
+ arc_buf_hdr_t *hdr = from->b_hdr;
+ uint64_t size = hdr->b_size;
+
+ ASSERT(hdr->b_state != arc_anon);
+
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = hdr->b_buf;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ bcopy(from->b_data, buf->b_data, size);
+ hdr->b_datacnt += 1;
+ return (buf);
+}
+
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+
+ /*
+ * Check to see if this buffer is evicted. Callers
+ * must verify b_data != NULL to know if the add_ref
+ * was successful.
+ */
+ mutex_enter(&buf->b_evict_lock);
+ if (buf->b_data == NULL) {
+ mutex_exit(&buf->b_evict_lock);
+ return;
+ }
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ mutex_exit(&buf->b_evict_lock);
+
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ add_reference(hdr, hash_lock, tag);
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, hits);
+}
+
+/*
+ * Free the arc data buffer. If it is an l2arc write in progress,
+ * the buffer is placed on l2arc_free_on_write to be freed later.
+ */
+static void
+arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
+ void *data, size_t size)
+{
+ if (HDR_L2_WRITING(hdr)) {
+ l2arc_data_free_t *df;
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df->l2df_data = data;
+ df->l2df_size = size;
+ df->l2df_func = free_func;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else {
+ free_func(data, size);
+ }
+}
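+
+/*
+ * Illustrative userland sketch, not part of the imported sources: the
+ * deferred-free pattern used above.  Rather than free memory that an
+ * in-flight asynchronous write may still be reading, the buffer is queued
+ * on a locked list and released later, once the write completes.  A singly
+ * linked list and pthreads stand in for the kernel list_t and kmutex_t.
+ */
+#include <pthread.h>
+#include <stdlib.h>
+
+struct deferred_free {
+        void    *df_data;
+        size_t  df_size;
+        void    (*df_func)(void *, size_t);
+        struct deferred_free *df_next;
+};
+
+static struct deferred_free *deferred_head;
+static pthread_mutex_t deferred_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+data_free(int write_in_progress, void (*free_func)(void *, size_t),
+    void *data, size_t size)
+{
+        if (!write_in_progress) {
+                free_func(data, size);
+                return;
+        }
+
+        /* defer: the write-completion path walks the list, calling df_func */
+        struct deferred_free *df = malloc(sizeof (*df));
+
+        if (df == NULL)
+                abort();        /* allocation-failure handling omitted */
+        df->df_data = data;
+        df->df_size = size;
+        df->df_func = free_func;
+        pthread_mutex_lock(&deferred_lock);
+        df->df_next = deferred_head;
+        deferred_head = df;
+        pthread_mutex_unlock(&deferred_lock);
+}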
+
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
+{
+ arc_buf_t **bufp;
+
+ /* free up data associated with the buf */
+ if (buf->b_data) {
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+
+ arc_cksum_verify(buf);
+
+ if (!recycle) {
+ if (type == ARC_BUFC_METADATA) {
+ arc_buf_data_free(buf->b_hdr, zio_buf_free,
+ buf->b_data, size);
+ arc_space_return(size, ARC_SPACE_DATA);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_buf_data_free(buf->b_hdr,
+ zio_data_buf_free, buf->b_data, size);
+ ARCSTAT_INCR(arcstat_data_size, -size);
+ atomic_add_64(&arc_size, -size);
+ }
+ }
+ if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ uint64_t *cnt = &state->arcs_lsize[type];
+
+ ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(state != arc_anon);
+
+ ASSERT3U(*cnt, >=, size);
+ atomic_add_64(cnt, -size);
+ }
+ ASSERT3U(state->arcs_size, >=, size);
+ atomic_add_64(&state->arcs_size, -size);
+ buf->b_data = NULL;
+ ASSERT(buf->b_hdr->b_datacnt > 0);
+ buf->b_hdr->b_datacnt -= 1;
+ }
+
+ /* only remove the buf if requested */
+ if (!all)
+ return;
+
+ /* remove the buf from the hdr list */
+ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ continue;
+ *bufp = buf->b_next;
+ buf->b_next = NULL;
+
+ ASSERT(buf->b_efunc == NULL);
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+
+ if (l2hdr != NULL) {
+ boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ if (!buflist_held) {
+ mutex_enter(&l2arc_buflist_mtx);
+ l2hdr = hdr->b_l2hdr;
+ }
+
+ if (l2hdr != NULL) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
+
+ if (!buflist_held)
+ mutex_exit(&l2arc_buflist_mtx);
+ }
+
+ if (!BUF_EMPTY(hdr)) {
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ buf_discard_identity(hdr);
+ }
+ while (hdr->b_buf) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&buf->b_evict_lock);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
+ hdr->b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&buf->b_evict_lock);
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
+ }
+ }
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ if (hdr->b_thawed) {
+ kmem_free(hdr->b_thawed, 1);
+ hdr->b_thawed = NULL;
+ }
+
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ ASSERT3P(hdr->b_acb, ==, NULL);
+ kmem_cache_free(hdr_cache, hdr);
+}
+
+void
+arc_buf_free(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ int hashed = hdr->b_state != arc_anon;
+
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(buf->b_data != NULL);
+
+ if (hashed) {
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1) {
+ arc_buf_destroy(buf, FALSE, TRUE);
+ } else {
+ ASSERT(buf == hdr->b_buf);
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
+ mutex_exit(hash_lock);
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ int destroy_hdr;
+ /*
+ * We are in the middle of an async write. Don't destroy
+ * this buffer unless the write completes before we finish
+ * decrementing the reference count.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ (void) remove_reference(hdr, NULL, tag);
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ if (remove_reference(hdr, NULL, tag) > 0)
+ arc_buf_destroy(buf, FALSE, TRUE);
+ else
+ arc_hdr_destroy(hdr);
+ }
+}
+
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ int no_callback = (buf->b_efunc == NULL);
+
+ if (hdr->b_state == arc_anon) {
+ ASSERT(hdr->b_datacnt == 1);
+ arc_buf_free(buf, tag);
+ return (no_callback);
+ }
+
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT(hdr->b_state != arc_anon);
+ ASSERT(buf->b_data != NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1) {
+ if (no_callback)
+ arc_buf_destroy(buf, FALSE, TRUE);
+ } else if (no_callback) {
+ ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
+ ASSERT(no_callback || hdr->b_datacnt > 1 ||
+ refcount_is_zero(&hdr->b_refcnt));
+ mutex_exit(hash_lock);
+ return (no_callback);
+}
+
+int
+arc_buf_size(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_size);
+}
+
+/*
+ * Evict buffers from list until we've removed the specified number of
+ * bytes. Move the removed buffers to the appropriate evict state.
+ * If the recycle flag is set, then attempt to "recycle" a buffer:
+ * - look for a buffer to evict that is `bytes' long.
+ * - return the data block from this buffer rather than freeing it.
+ * This flag is used by callers that are trying to make space for a
+ * new buffer in a full arc cache.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ */
+static void *
+arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
+ arc_buf_contents_t type)
+{
+ arc_state_t *evicted_state;
+ uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
+ arc_buf_hdr_t *ab, *ab_prev = NULL;
+ list_t *list = &state->arcs_list[type];
+ kmutex_t *hash_lock;
+ boolean_t have_lock;
+ void *stolen = NULL;
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(ab) ||
+ (spa && ab->b_spa != spa) ||
+ (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
+ ddi_get_lbolt() - ab->b_arc_access <
+ arc_min_prefetch_lifespan)) {
+ skipped++;
+ continue;
+ }
+ /* "lookahead" for better eviction candidate */
+ if (recycle && ab->b_size != bytes &&
+ ab_prev && ab_prev->b_size == bytes)
+ continue;
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (have_lock || mutex_tryenter(hash_lock)) {
+ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT(ab->b_datacnt > 0);
+ while (ab->b_buf) {
+ arc_buf_t *buf = ab->b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ missed += 1;
+ break;
+ }
+ if (buf->b_data) {
+ bytes_evicted += ab->b_size;
+ if (recycle && ab->b_type == type &&
+ ab->b_size == bytes &&
+ !HDR_L2_WRITING(ab)) {
+ stolen = buf->b_data;
+ recycle = FALSE;
+ }
+ }
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ arc_buf_destroy(buf,
+ buf->b_data == stolen, FALSE);
+ ab->b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ mutex_exit(&buf->b_evict_lock);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy(buf,
+ buf->b_data == stolen, TRUE);
+ }
+ }
+
+ if (ab->b_l2hdr) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached,
+ ab->b_size);
+ } else {
+ if (l2arc_write_eligible(ab->b_spa, ab)) {
+ ARCSTAT_INCR(arcstat_evict_l2_eligible,
+ ab->b_size);
+ } else {
+ ARCSTAT_INCR(
+ arcstat_evict_l2_ineligible,
+ ab->b_size);
+ }
+ }
+
+ if (ab->b_datacnt == 0) {
+ arc_change_state(evicted_state, ab, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags |= ARC_IN_HASH_TABLE;
+ ab->b_flags &= ~ARC_BUF_AVAILABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ }
+ if (!have_lock)
+ mutex_exit(hash_lock);
+ if (bytes >= 0 && bytes_evicted >= bytes)
+ break;
+ } else {
+ missed += 1;
+ }
+ }
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&state->arcs_mtx);
+
+ if (bytes_evicted < bytes)
+ dprintf("only evicted %lld bytes from %x",
+ (longlong_t)bytes_evicted, state);
+
+ if (skipped)
+ ARCSTAT_INCR(arcstat_evict_skip, skipped);
+
+ if (missed)
+ ARCSTAT_INCR(arcstat_mutex_miss, missed);
+
+ /*
+ * We have just evicted some data into the ghost state, so make
+ * sure we also adjust the ghost state size if necessary.
+ */
+ if (arc_no_grow &&
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
+ int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
+ arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete =
+ MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
+ arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+ } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
+ int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
+ arc_mru_ghost->arcs_size +
+ arc_mfu_ghost->arcs_size - arc_c);
+ arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+ }
+ }
+
+ return (stolen);
+}
+
+/*
+ * Remove buffers from list until we've removed the specified number of
+ * bytes. Destroy the buffers that are removed.
+ */
+static void
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+{
+ arc_buf_hdr_t *ab, *ab_prev;
+ arc_buf_hdr_t marker = { 0 };
+ list_t *list = &state->arcs_list[ARC_BUFC_DATA];
+ kmutex_t *hash_lock;
+ uint64_t bytes_deleted = 0;
+ uint64_t bufs_skipped = 0;
+
+ ASSERT(GHOST_STATE(state));
+top:
+ mutex_enter(&state->arcs_mtx);
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
+ if (spa && ab->b_spa != spa)
+ continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
+ hash_lock = HDR_LOCK(ab);
+ /* caller may be trying to modify this buffer, skip it */
+ if (MUTEX_HELD(hash_lock))
+ continue;
+ if (mutex_tryenter(hash_lock)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(ab));
+ ASSERT(ab->b_buf == NULL);
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_deleted += ab->b_size;
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, ab, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_change_state(arc_anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ }
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
+ if (bytes >= 0 && bytes_deleted >= bytes)
+ break;
+ } else if (bytes < 0) {
+ /*
+ * Insert a list marker and then wait for the
+ * hash lock to become available. Once it's
+ * available, restart from where we left off.
+ */
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&state->arcs_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ mutex_enter(&state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ } else
+ bufs_skipped += 1;
+ }
+ mutex_exit(&state->arcs_mtx);
+
+ if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+ (bytes < 0 || bytes_deleted < bytes)) {
+ list = &state->arcs_list[ARC_BUFC_METADATA];
+ goto top;
+ }
+
+ if (bufs_skipped) {
+ ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
+ ASSERT(bytes >= 0);
+ }
+
+ if (bytes_deleted < bytes)
+ dprintf("only deleted %lld bytes from %p",
+ (longlong_t)bytes_deleted, state);
+}
+
+static void
+arc_adjust(void)
+{
+ int64_t adjustment, delta;
+
+ /*
+ * Adjust MRU size
+ */
+
+ adjustment = MIN((int64_t)(arc_size - arc_c),
+ (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+ arc_p));
+
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
+ (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
+ adjustment -= delta;
+ }
+
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ (void) arc_evict(arc_mru, NULL, delta, FALSE,
+ ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Adjust MFU size
+ */
+
+ adjustment = arc_size - arc_c;
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
+ (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
+ adjustment -= delta;
+ }
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t delta = MIN(adjustment,
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
+ (void) arc_evict(arc_mfu, NULL, delta, FALSE,
+ ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Adjust ghost lists
+ */
+
+ adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+
+ if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
+ delta = MIN(arc_mru_ghost->arcs_size, adjustment);
+ arc_evict_ghost(arc_mru_ghost, NULL, delta);
+ }
+
+ adjustment =
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+
+ if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
+ delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
+ arc_evict_ghost(arc_mfu_ghost, NULL, delta);
+ }
+}
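+
+/*
+ * Worked example of the MRU pass above, with purely hypothetical sizes:
+ * if arc_c = 4 GB, arc_size = 4.5 GB, arc_p = 2 GB and
+ * anon + mru + meta_used = 2.6 GB, then
+ *
+ *	adjustment = MIN(4.5 GB - 4 GB, 2.6 GB - 2 GB) = 0.5 GB
+ *
+ * and arc_evict() is asked to reclaim 0.5 GB from arc_mru, taking data
+ * buffers first and metadata only for whatever remains.
+ */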
+
+static void
+arc_do_user_evicts(void)
+{
+ mutex_enter(&arc_eviction_mtx);
+ while (arc_eviction_list != NULL) {
+ arc_buf_t *buf = arc_eviction_list;
+ arc_eviction_list = buf->b_next;
+ mutex_enter(&buf->b_evict_lock);
+ buf->b_hdr = NULL;
+ mutex_exit(&buf->b_evict_lock);
+ mutex_exit(&arc_eviction_mtx);
+
+ if (buf->b_efunc != NULL)
+ VERIFY(buf->b_efunc(buf) == 0);
+
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ kmem_cache_free(buf_cache, buf);
+ mutex_enter(&arc_eviction_mtx);
+ }
+ mutex_exit(&arc_eviction_mtx);
+}
+
+/*
+ * Flush all *evictable* data from the cache for the given spa.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ */
+void
+arc_flush(spa_t *spa)
+{
+ uint64_t guid = 0;
+
+ if (spa)
+ guid = spa_guid(spa);
+
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
+ (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
+ if (spa)
+ break;
+ }
+ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
+ (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
+ if (spa)
+ break;
+ }
+
+ arc_evict_ghost(arc_mru_ghost, guid, -1);
+ arc_evict_ghost(arc_mfu_ghost, guid, -1);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_do_user_evicts();
+ mutex_exit(&arc_reclaim_thr_lock);
+ ASSERT(spa || arc_eviction_list == NULL);
+}
+
+void
+arc_shrink(void)
+{
+ if (arc_c > arc_c_min) {
+ uint64_t to_free;
+
+#ifdef _KERNEL
+ to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
+#else
+ to_free = arc_c >> arc_shrink_shift;
+#endif
+ if (arc_c > arc_c_min + to_free)
+ atomic_add_64(&arc_c, -to_free);
+ else
+ arc_c = arc_c_min;
+
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ if (arc_c > arc_size)
+ arc_c = MAX(arc_size, arc_c_min);
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ }
+
+ if (arc_size > arc_c)
+ arc_adjust();
+}
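+
+/*
+ * Rough sketch of the shrink arithmetic above, assuming for illustration
+ * an arc_shrink_shift of 5 (the actual default is defined elsewhere in
+ * this file): with arc_c = 4 GB, to_free = arc_c >> 5 = 128 MB, so arc_c
+ * drops to 3.875 GB (or is clamped to arc_c_min), arc_p is reduced by
+ * arc_p >> 5 in the same pass, and arc_adjust() then runs only if
+ * arc_size still exceeds the new arc_c.
+ */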
+
+static int
+arc_reclaim_needed(void)
+{
+ uint64_t extra;
+
+#ifdef _KERNEL
+
+ if (needfree)
+ return (1);
+
+ /*
+ * take 'desfree' extra pages, so we reclaim sooner, rather than later
+ */
+ extra = desfree;
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ if (freemem < lotsfree + needfree + extra)
+ return (1);
+
+ /*
+ * check to make sure that swapfs has enough space so that anon
+ * reservations can still succeed. anon_resvmem() checks that the
+ * availrmem is greater than swapfs_minfree, and the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ if (availrmem < swapfs_minfree + swapfs_reserve + extra)
+ return (1);
+
+#if defined(__i386)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+ (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+ return (1);
+#endif
+
+#else
+ if (spa_get_random(100) == 0)
+ return (1);
+#endif
+ return (0);
+}
+
+static void
+arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
+
+#ifdef _KERNEL
+ if (arc_meta_used >= arc_meta_limit) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Purge some DNLC entries to release holds on meta-data.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ }
+#if defined(__i386)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ /*
+ * An aggressive reclamation will shrink the cache size as well as
+ * reap free buffers from the arc kmem caches.
+ */
+ if (strat == ARC_RECLAIM_AGGR)
+ arc_shrink();
+
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
+ }
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_cache);
+}
+
+static void
+arc_reclaim_thread(void)
+{
+ clock_t growtime = 0;
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ while (arc_thread_exit == 0) {
+ if (arc_reclaim_needed()) {
+
+ if (arc_no_grow) {
+ if (last_reclaim == ARC_RECLAIM_CONS) {
+ last_reclaim = ARC_RECLAIM_AGGR;
+ } else {
+ last_reclaim = ARC_RECLAIM_CONS;
+ }
+ } else {
+ arc_no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ membar_producer();
+ }
+
+ /* reset the growth delay for every reclaim */
+ growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+
+ arc_kmem_reap_now(last_reclaim);
+ arc_warm = B_TRUE;
+
+ } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
+ arc_no_grow = FALSE;
+ }
+
+ arc_adjust();
+
+ if (arc_eviction_list != NULL)
+ arc_do_user_evicts();
+
+ /* block until needed, or one second, whichever is shorter */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thr_cv,
+ &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ }
+
+ arc_thread_exit = 0;
+ cv_broadcast(&arc_reclaim_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+ int mult;
+ uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+
+ if (state == arc_l2c_only)
+ return;
+
+ ASSERT(bytes > 0);
+ /*
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
+ */
+ if (state == arc_mru_ghost) {
+ mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
+ 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
+
+ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
+ } else if (state == arc_mfu_ghost) {
+ uint64_t delta;
+
+ mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
+ 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+ mult = MIN(mult, 10);
+
+ delta = MIN(bytes * mult, arc_p);
+ arc_p = MAX(arc_p_min, arc_p - delta);
+ }
+ ASSERT((int64_t)arc_p >= 0);
+
+ if (arc_reclaim_needed()) {
+ cv_signal(&arc_reclaim_thr_cv);
+ return;
+ }
+
+ if (arc_no_grow)
+ return;
+
+ if (arc_c >= arc_c_max)
+ return;
+
+ /*
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
+ */
+ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ atomic_add_64(&arc_c, (int64_t)bytes);
+ if (arc_c > arc_c_max)
+ arc_c = arc_c_max;
+ else if (state == arc_anon)
+ atomic_add_64(&arc_p, (int64_t)bytes);
+ if (arc_p > arc_c)
+ arc_p = arc_c;
+ }
+ ASSERT((int64_t)arc_p >= 0);
+}
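+
+/*
+ * Worked example of the ghost-hit adaptation above, with hypothetical
+ * sizes: on an arc_mru_ghost hit with bytes = 128 KB, arc_mru_ghost of
+ * 1 GB and arc_mfu_ghost of 3 GB, mult is 3 (capped at 10), so arc_p
+ * grows by 3 * 128 KB = 384 KB, never exceeding arc_c - arc_p_min. A hit
+ * in arc_mfu_ghost applies the symmetric shrink to arc_p instead.
+ */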
+
+/*
+ * Check if the cache has reached its limits and eviction is required
+ * prior to insert.
+ */
+static int
+arc_evict_needed(arc_buf_contents_t type)
+{
+ if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
+ return (1);
+
+#ifdef _KERNEL
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this area remains
+ * above about 1/32nd free.
+ */
+ if (type == ARC_BUFC_DATA && zio_arena != NULL &&
+ vmem_size(zio_arena, VMEM_FREE) <
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
+ return (1);
+#endif
+
+ if (arc_reclaim_needed())
+ return (1);
+
+ return (arc_size > arc_c);
+}
+
+/*
+ * The buffer, supplied as the first argument, needs a data block.
+ * So, if we are at cache max, determine which cache should be victimized.
+ * We have the following cases:
+ *
+ * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
+ * In this situation if we're out of space, but the resident size of the MFU is
+ * under the limit, victimize the MFU cache to satisfy this insertion request.
+ *
+ * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
+ * Here, we've used up all of the available space for the MRU, so we need to
+ * evict from our own cache instead. Evict from the set of resident MRU
+ * entries.
+ *
+ * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
+ * c minus p represents the MFU space in the cache, since p is the size of the
+ * cache that is dedicated to the MRU. In this situation there's still space on
+ * the MFU side, so the MRU side needs to be victimized.
+ *
+ * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
+ * MFU's resident set is consuming more space than it has been allotted. In
+ * this situation, we must victimize our own cache, the MFU, for this insertion.
+ */
+static void
+arc_get_data_buf(arc_buf_t *buf)
+{
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+
+ arc_adapt(size, state);
+
+ /*
+ * We have not yet reached cache maximum size,
+ * just allocate a new buffer.
+ */
+ if (!arc_evict_needed(type)) {
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_DATA);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ ARCSTAT_INCR(arcstat_data_size, size);
+ atomic_add_64(&arc_size, size);
+ }
+ goto out;
+ }
+
+ /*
+ * If we are prefetching from the mfu ghost list, this buffer
+ * will end up on the mru list; so steal space from there.
+ */
+ if (state == arc_mfu_ghost)
+ state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
+ else if (state == arc_mru_ghost)
+ state = arc_mru;
+
+ if (state == arc_mru || state == arc_anon) {
+ uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
+ state = (arc_mfu->arcs_lsize[type] >= size &&
+ arc_p > mru_used) ? arc_mfu : arc_mru;
+ } else {
+ /* MFU cases */
+ uint64_t mfu_space = arc_c - arc_p;
+ state = (arc_mru->arcs_lsize[type] >= size &&
+ mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ }
+ if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_DATA);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ ARCSTAT_INCR(arcstat_data_size, size);
+ atomic_add_64(&arc_size, size);
+ }
+ ARCSTAT_BUMP(arcstat_recycle_miss);
+ }
+ ASSERT(buf->b_data != NULL);
+out:
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(buf->b_hdr->b_state)) {
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ atomic_add_64(&hdr->b_state->arcs_size, size);
+ if (list_link_active(&hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+ }
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (arc_size < arc_c && hdr->b_state == arc_anon &&
+ arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+}
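+
+/*
+ * Concrete reading of the case analysis above arc_get_data_buf(), using
+ * hypothetical sizes: for a 128 KB MRU-side insert with arc_p = 4 GB and
+ * arc_anon + arc_mru = 3 GB (case 1), arc_p exceeds mru_used, so as long
+ * as arc_mfu holds at least 128 KB of evictable buffers of this type the
+ * MFU list is victimized; with arc_anon + arc_mru = 5 GB instead
+ * (case 2), the MRU list evicts from itself to make room.
+ */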
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
+ */
+static void
+arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+{
+ clock_t now;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if (buf->b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT(buf->b_arc_access == 0);
+ buf->b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mru, buf, hash_lock);
+
+ } else if (buf->b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ if (refcount_count(&buf->b_refcnt) == 0) {
+ ASSERT(list_link_active(&buf->b_arc_node));
+ } else {
+ buf->b_flags &= ~ARC_PREFETCH;
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ }
+ buf->b_arc_access = now;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (now > buf->b_arc_access + ARC_MINTIME) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ buf->b_arc_access = now;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
+ }
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (buf->b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ new_state = arc_mru;
+ if (refcount_count(&buf->b_refcnt) > 0)
+ buf->b_flags &= ~ARC_PREFETCH;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ }
+
+ buf->b_arc_access = ddi_get_lbolt();
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (buf->b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ ASSERT(refcount_count(&buf->b_refcnt) == 0);
+ ASSERT(list_link_active(&buf->b_arc_node));
+ }
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ buf->b_arc_access = ddi_get_lbolt();
+ } else if (buf->b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ new_state = arc_mru;
+ }
+
+ buf->b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (buf->b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ buf->b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
+ } else {
+ ASSERT(!"invalid arc state");
+ }
+}
+
+/* a generic arc_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ if (zio == NULL || zio->io_error == 0)
+ bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+}
+
+/* a generic arc_done_func_t */
+void
+arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ arc_buf_t **bufp = arg;
+ if (zio && zio->io_error) {
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+ *bufp = NULL;
+ } else {
+ *bufp = buf;
+ ASSERT(buf->b_data);
+ }
+}
+
+static void
+arc_read_done(zio_t *zio)
+{
+ arc_buf_hdr_t *hdr, *found;
+ arc_buf_t *buf;
+ arc_buf_t *abuf; /* buffer we're assigning to callback */
+ kmutex_t *hash_lock;
+ arc_callback_t *callback_list, *acb;
+ int freeable = FALSE;
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+
+ /*
+ * The hdr was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O. The only possible
+ * reason for it not to be found is if we were freed during the
+ * read.
+ */
+ found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
+ &hash_lock);
+
+ ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+
+ hdr->b_flags &= ~ARC_L2_EVICTED;
+ if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+ hdr->b_flags &= ~ARC_L2CACHE;
+
+ /* byteswap if necessary */
+ callback_list = hdr->b_acb;
+ ASSERT(callback_list != NULL);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
+ arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
+ byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+ func(buf->b_data, hdr->b_size);
+ }
+
+ arc_cksum_compute(buf, B_FALSE);
+
+ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
+ /* create copies of the data buffer for the callers */
+ abuf = buf;
+ for (acb = callback_list; acb; acb = acb->acb_next) {
+ if (acb->acb_done) {
+ if (abuf == NULL)
+ abuf = arc_buf_clone(buf);
+ acb->acb_buf = abuf;
+ abuf = NULL;
+ }
+ }
+ hdr->b_acb = NULL;
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ ASSERT(!HDR_BUF_AVAILABLE(hdr));
+ if (abuf == buf) {
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(hdr->b_datacnt == 1);
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+
+ if (zio->io_error != 0) {
+ hdr->b_flags |= ARC_IO_ERROR;
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /*
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_cv);
+
+ if (hash_lock) {
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done)
+ acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp. But if you know you don't need locking, you can use
+ * arc_read_bp.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ int err;
+
+ if (pbuf == NULL) {
+ /*
+ * XXX This happens from traverse callback funcs, for
+ * the objset_phys_t block.
+ */
+ return (arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb));
+ }
+
+ ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
+ ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
+ rw_enter(&pbuf->b_data_lock, RW_READER);
+
+ err = arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb);
+ rw_exit(&pbuf->b_data_lock);
+
+ return (err);
+}
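+
+/*
+ * Hypothetical caller sketch (not taken from this file): a synchronous
+ * read that only wants the buffer back can pair ARC_WAIT with the
+ * generic arc_getbuf_func() callback defined above, roughly:
+ *
+ *	arc_buf_t *abuf = NULL;
+ *	uint32_t aflags = ARC_WAIT;
+ *	int err;
+ *
+ *	err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &abuf,
+ *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
+ *	if (err == 0 && abuf != NULL) {
+ *		... use abuf->b_data ...
+ *		(void) arc_buf_remove_ref(abuf, &abuf);
+ *	}
+ *
+ * ZIO_PRIORITY_SYNC_READ and ZIO_FLAG_CANFAIL are assumed here as typical
+ * zio parameters; spa, bp and zb come from the caller's own context.
+ */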
+
+int
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ zio_t *rzio;
+ uint64_t guid = spa_guid(spa);
+
+top:
+ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
+ if (hdr && hdr->b_datacnt > 0) {
+
+ *arc_flags |= ARC_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+
+ if (*arc_flags & ARC_WAIT) {
+ cv_wait(&hdr->b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_NOWAIT);
+
+ if (done) {
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, NULL, zio_flags);
+
+ ASSERT(acb->acb_done != NULL);
+ acb->acb_next = hdr->b_acb;
+ hdr->b_acb = acb;
+ add_reference(hdr, hash_lock, private);
+ mutex_exit(hash_lock);
+ return (0);
+ }
+ mutex_exit(hash_lock);
+ return (0);
+ }
+
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ if (done) {
+ add_reference(hdr, hash_lock, private);
+ /*
+ * If this block is already in use, create a new
+ * copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
+ buf = hdr->b_buf;
+ ASSERT(buf);
+ ASSERT(buf->b_data);
+ if (HDR_BUF_AVAILABLE(hdr)) {
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ } else {
+ buf = arc_buf_clone(buf);
+ }
+
+ } else if (*arc_flags & ARC_PREFETCH &&
+ refcount_count(&hdr->b_refcnt) == 0) {
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, buf, private);
+ } else {
+ uint64_t size = BP_GET_LSIZE(bp);
+ arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ uint64_t addr;
+ boolean_t devw = B_FALSE;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ buf = arc_buf_alloc(spa, size, private, type);
+ hdr = buf->b_hdr;
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ (void) arc_buf_remove_ref(buf, private);
+ goto top; /* restart the IO request */
+ }
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH) {
+ (void) remove_reference(hdr, hash_lock,
+ private);
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
+ if (BP_GET_LEVEL(bp) > 0)
+ hdr->b_flags |= ARC_INDIRECT;
+ } else {
+ /* this block is in the ghost cache */
+ ASSERT(GHOST_STATE(hdr->b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+ ASSERT(hdr->b_buf == NULL);
+
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH)
+ hdr->b_flags |= ARC_PREFETCH;
+ else
+ add_reference(hdr, hash_lock, private);
+ if (*arc_flags & ARC_L2CACHE)
+ hdr->b_flags |= ARC_L2CACHE;
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ ASSERT(hdr->b_datacnt == 0);
+ hdr->b_datacnt = 1;
+ arc_get_data_buf(buf);
+ arc_access(hdr, hash_lock);
+ }
+
+ ASSERT(!GHOST_STATE(hdr->b_state));
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+
+ ASSERT(hdr->b_acb == NULL);
+ hdr->b_acb = acb;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+
+ if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
+ (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr->b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr->b_daddr;
+ /*
+ * Lock out device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
+ mutex_exit(hash_lock);
+
+ ASSERT3U(hdr->b_size, ==, size);
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+ uint64_t, size, zbookmark_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, misses);
+
+ if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+			 * 5. This isn't a prefetch while l2arc_noprefetch is set.
+ */
+ if (hdr->b_l2hdr != NULL &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+ !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
+ l2arc_read_callback_t *cb;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_buf = buf;
+ cb->l2rcb_spa = spa;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ /*
+ * l2arc read. The SCL_L2ARC lock will be
+ * released by l2arc_read_done().
+ */
+ rzio = zio_read_phys(pio, vd, addr, size,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority, zio_flags |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+ ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+
+ if (*arc_flags & ARC_NOWAIT) {
+ zio_nowait(rzio);
+ return (0);
+ }
+
+ ASSERT(*arc_flags & ARC_WAIT);
+ if (zio_wait(rzio) == 0)
+ return (0);
+
+ /* l2arc read error; goto zio_read() */
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ }
+ } else {
+ if (vd != NULL)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ if (l2arc_ndev != 0) {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ }
+ }
+
+ rzio = zio_read(pio, spa, bp, buf->b_data, size,
+ arc_read_done, buf, priority, zio_flags, zb);
+
+ if (*arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(*arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ }
+ return (0);
+}
+
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+ ASSERT(buf->b_hdr != NULL);
+ ASSERT(buf->b_hdr->b_state != arc_anon);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
+ buf->b_efunc = func;
+ buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up. If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ arc_buf_t **bufp;
+
+ mutex_enter(&buf->b_evict_lock);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts().
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&buf->b_evict_lock);
+ return (0);
+ } else if (buf->b_data == NULL) {
+ arc_buf_t copy = *buf; /* structure assignment */
+ /*
+ * We are on the eviction list; process this buffer now
+ * but let arc_do_user_evicts() do the reaping.
+ */
+ buf->b_efunc = NULL;
+ mutex_exit(&buf->b_evict_lock);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
+ }
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ /*
+ * Pull this buffer off of the hdr
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_data != NULL);
+ arc_buf_destroy(buf, FALSE, FALSE);
+
+ if (hdr->b_datacnt == 0) {
+ arc_state_t *old_state = hdr->b_state;
+ arc_state_t *evicted_state;
+
+ ASSERT(hdr->b_buf == NULL);
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ evicted_state =
+ (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&old_state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags |= ARC_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ mutex_exit(hash_lock);
+ mutex_exit(&buf->b_evict_lock);
+
+ VERIFY(buf->b_efunc(buf) == 0);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_hdr = NULL;
+ buf->b_next = NULL;
+ kmem_cache_free(buf_cache, buf);
+ return (1);
+}
+
+/*
+ * Release this buffer from the cache. This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock = NULL;
+ l2arc_buf_hdr_t *l2hdr;
+ uint64_t buf_size;
+
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
+ hdr = buf->b_hdr;
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+ if (hdr->b_state == arc_anon) {
+ /* this buffer is already released */
+ ASSERT(buf->b_efunc == NULL);
+ } else {
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ }
+
+ l2hdr = hdr->b_l2hdr;
+ if (l2hdr) {
+ mutex_enter(&l2arc_buflist_mtx);
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_datacnt > 1) {
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t **bufp;
+ uint64_t blksz = hdr->b_size;
+ uint64_t spa = hdr->b_spa;
+ arc_buf_contents_t type = hdr->b_type;
+ uint32_t flags = hdr->b_flags;
+
+ ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
+ /*
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr.
+ */
+ (void) remove_reference(hdr, hash_lock, tag);
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+ buf->b_next = NULL;
+
+ ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_refcnt)) {
+ uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+ ASSERT3U(*size, >=, hdr->b_size);
+ atomic_add_64(size, -hdr->b_size);
+ }
+ hdr->b_datacnt -= 1;
+ arc_cksum_verify(buf);
+
+ mutex_exit(hash_lock);
+
+ nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ nhdr->b_size = blksz;
+ nhdr->b_spa = spa;
+ nhdr->b_type = type;
+ nhdr->b_buf = buf;
+ nhdr->b_state = arc_anon;
+ nhdr->b_arc_access = 0;
+ nhdr->b_flags = flags & ARC_L2_WRITING;
+ nhdr->b_l2hdr = NULL;
+ nhdr->b_datacnt = 1;
+ nhdr->b_freeze_cksum = NULL;
+ (void) refcount_add(&nhdr->b_refcnt, tag);
+ buf->b_hdr = nhdr;
+ mutex_exit(&buf->b_evict_lock);
+ atomic_add_64(&arc_anon->arcs_size, blksz);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(refcount_count(&hdr->b_refcnt) == 1);
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_arc_access = 0;
+ if (hash_lock)
+ mutex_exit(hash_lock);
+
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+ }
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ if (l2hdr) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -buf_size);
+ mutex_exit(&l2arc_buflist_mtx);
+ }
+}
+
+/*
+ * Release this buffer. If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb)
+{
+ arc_release(buf, tag);
+ return (0);
+}
+
+int
+arc_released(arc_buf_t *buf)
+{
+ int released;
+
+ mutex_enter(&buf->b_evict_lock);
+ released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ mutex_exit(&buf->b_evict_lock);
+ return (released);
+}
+
+int
+arc_has_callback(arc_buf_t *buf)
+{
+ int callback;
+
+ mutex_enter(&buf->b_evict_lock);
+ callback = (buf->b_efunc != NULL);
+ mutex_exit(&buf->b_evict_lock);
+ return (callback);
+}
+
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ int referenced;
+
+ mutex_enter(&buf->b_evict_lock);
+ referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+ mutex_exit(&buf->b_evict_lock);
+ return (referenced);
+}
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ /*
+ * If the IO is already in progress, then this is a re-write
+ * attempt, so we need to thaw and re-compute the cksum.
+ * It is the responsibility of the callback to handle the
+ * accounting for any re-write attempt.
+ */
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ mutex_enter(&hdr->b_freeze_lock);
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_freeze_lock);
+ }
+ arc_cksum_compute(buf, B_FALSE);
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+}
+
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(hdr->b_acb == NULL);
+
+ if (zio->io_error == 0) {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ } else {
+ ASSERT(BUF_EMPTY(hdr));
+ }
+
+ /*
+ * If the block to be written was all-zero, we may have
+ * compressed it away. In this case no write was performed
+ * so there will be no dva/birth/checksum. The buffer must
+ * therefore remain anonymous (and uncached).
+ */
+ if (!BUF_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ ASSERT(zio->io_error == 0);
+
+ arc_cksum_verify(buf);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_datacnt == 1);
+ ASSERT(hdr->b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ }
+ }
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ /* if it's not anon, we are doing a scrub */
+ if (!exists && hdr->b_state == arc_anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ }
+
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
+
+ kmem_free(callback, sizeof (arc_write_callback_t));
+}
+
+zio_t *
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_write_callback_t *callback;
+ zio_t *zio;
+
+ ASSERT(ready != NULL);
+ ASSERT(done != NULL);
+ ASSERT(!HDR_IO_ERROR(hdr));
+ ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
+ ASSERT(hdr->b_acb == NULL);
+ if (l2arc)
+ hdr->b_flags |= ARC_L2CACHE;
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback->awcb_ready = ready;
+ callback->awcb_done = done;
+ callback->awcb_private = private;
+ callback->awcb_buf = buf;
+
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
+ arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+
+ return (zio);
+}
+
+static int
+arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+{
+#ifdef _KERNEL
+ uint64_t available_memory = ptob(freemem);
+ static uint64_t page_load = 0;
+ static uint64_t last_txg = 0;
+
+#if defined(__i386)
+ available_memory =
+ MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+#endif
+ if (available_memory >= zfs_write_limit_max)
+ return (0);
+
+ if (txg > last_txg) {
+ last_txg = txg;
+ page_load = 0;
+ }
+ /*
+	 * If we are in pageout, we know that memory is already tight
+	 * and the arc is already going to be evicting, so we just want
+	 * to continue to let page writes occur as quickly as possible.
+ */
+ if (curproc == proc_pageout) {
+ if (page_load > MAX(ptob(minfree), available_memory) / 4)
+ return (ERESTART);
+ /* Note: reserve is inflated, so we deflate */
+ page_load += reserve / 8;
+ return (0);
+ } else if (page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (EAGAIN);
+ }
+ page_load = 0;
+
+ if (arc_size > arc_c_min) {
+ uint64_t evictable_memory =
+ arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+ available_memory += MIN(evictable_memory, arc_size - arc_c_min);
+ }
+
+ if (inflight_data > available_memory / 4) {
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ return (ERESTART);
+ }
+#endif
+ return (0);
+}
+
+void
+arc_tempreserve_clear(uint64_t reserve)
+{
+ atomic_add_64(&arc_tempreserve, -reserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(uint64_t reserve, uint64_t txg)
+{
+ int error;
+ uint64_t anon_size;
+
+#ifdef ZFS_DEBUG
+ /*
+ * Once in a while, fail for no reason. Everything should cope.
+ */
+ if (spa_get_random(10000) == 0) {
+ dprintf("forcing random failure\n");
+ return (ERESTART);
+ }
+#endif
+ if (reserve > arc_c/4 && !arc_no_grow)
+ arc_c = MIN(arc_c_max, reserve * 4);
+ if (reserve > arc_c)
+ return (ENOMEM);
+
+ /*
+ * Don't count loaned bufs as in flight dirty data to prevent long
+ * network delays from blocking transactions that are ready to be
+ * assigned to a txg.
+ */
+ anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+
+ /*
+ * Writes will, almost always, require additional memory allocations
+	 * in order to compress/encrypt/etc the data. We therefore need to
+ * make sure that there is sufficient available memory for this.
+ */
+ if (error = arc_memory_throttle(reserve, anon_size, txg))
+ return (error);
+
+ /*
+ * Throttle writes when the amount of dirty data in the cache
+ * gets too large. We try to keep the cache less than half full
+ * of dirty blocks so that our sync times don't grow too large.
+ * Note: if two requests come in concurrently, we might let them
+ * both succeed, when one of them should fail. Not a huge deal.
+ */
+
+ if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
+ anon_size > arc_c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+ "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+ arc_tempreserve>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
+ arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
+ reserve>>10, arc_c>>10);
+ return (ERESTART);
+ }
+ atomic_add_64(&arc_tempreserve, reserve);
+ return (0);
+}
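+
+/*
+ * Worked example of the dirty-data throttle above, with a hypothetical
+ * arc_c of 4 GB: a reservation fails with ERESTART once
+ * reserve + arc_tempreserve + anon_size exceeds arc_c / 2 = 2 GB while
+ * anon_size alone is over arc_c / 4 = 1 GB; otherwise the reserve is
+ * added to arc_tempreserve and the caller proceeds.
+ */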
+
+void
+arc_init(void)
+{
+ mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Convert seconds to clock ticks */
+ arc_min_prefetch_lifespan = 1 * hz;
+
+ /* Start out with 1/8 of all memory */
+ arc_c = physmem * PAGESIZE / 8;
+
+#ifdef _KERNEL
+ /*
+ * On architectures where the physical memory can be larger
+ * than the addressable space (intel in 32-bit mode), we may
+ * need to limit the cache to 1/8 of VM size.
+ */
+ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+#endif
+
+ /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
+ arc_c_min = MAX(arc_c / 4, 64<<20);
+ /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
+ if (arc_c * 8 >= 1<<30)
+ arc_c_max = (arc_c * 8) - (1<<30);
+ else
+ arc_c_max = arc_c_min;
+ arc_c_max = MAX(arc_c * 6, arc_c_max);
+
+ /*
+ * Allow the tunables to override our calculations if they are
+	 * reasonable (i.e. over 64MB)
+ */
+ if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
+ arc_c_max = zfs_arc_max;
+ if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
+ arc_c_min = zfs_arc_min;
+
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+
+ /* Allow the tunable to override if it is reasonable */
+ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
+ arc_meta_limit = zfs_arc_meta_limit;
+
+ if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
+ arc_c_min = arc_meta_limit / 2;
+
+ if (zfs_arc_grow_retry > 0)
+ arc_grow_retry = zfs_arc_grow_retry;
+
+ if (zfs_arc_shrink_shift > 0)
+ arc_shrink_shift = zfs_arc_shrink_shift;
+
+ if (zfs_arc_p_min_shift > 0)
+ arc_p_min_shift = zfs_arc_p_min_shift;
+
+ /* if kmem_flags are set, lets try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
+ arc_size = 0;
+
+ mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+
+ buf_init();
+
+ arc_thread_exit = 0;
+ arc_eviction_list = NULL;
+ mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
+ bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ kstat_install(arc_ksp);
+ }
+
+ (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
+ arc_dead = FALSE;
+ arc_warm = B_FALSE;
+
+ if (zfs_write_limit_max == 0)
+ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
+ else
+ zfs_write_limit_shift = 0;
+ mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+}
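+
+/*
+ * Sizing sketch for the defaults above, assuming a hypothetical machine
+ * with 8 GB of physical memory and no tunables set: arc_c starts at 1 GB
+ * (1/8 of memory), arc_c_min at MAX(256 MB, 64 MB) = 256 MB, and
+ * arc_c_max at MAX(6 GB, 8 GB - 1 GB) = 7 GB. arc_c is then raised to
+ * arc_c_max (7 GB), arc_p to half of that, arc_meta_limit to 7 GB / 4,
+ * and, since zfs_arc_min is unset, arc_c_min is bumped up to
+ * arc_meta_limit / 2 = 896 MB.
+ */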
+
+void
+arc_fini(void)
+{
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_thread_exit = 1;
+ while (arc_thread_exit != 0)
+ cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ arc_flush(NULL);
+
+ arc_dead = TRUE;
+
+ if (arc_ksp != NULL) {
+ kstat_delete(arc_ksp);
+ arc_ksp = NULL;
+ }
+
+ mutex_destroy(&arc_eviction_mtx);
+ mutex_destroy(&arc_reclaim_thr_lock);
+ cv_destroy(&arc_reclaim_thr_cv);
+
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+
+ mutex_destroy(&arc_anon->arcs_mtx);
+ mutex_destroy(&arc_mru->arcs_mtx);
+ mutex_destroy(&arc_mru_ghost->arcs_mtx);
+ mutex_destroy(&arc_mfu->arcs_mtx);
+ mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+ mutex_destroy(&arc_l2c_only->arcs_mtx);
+
+ mutex_destroy(&zfs_write_limit_lock);
+
+ buf_fini();
+
+ ASSERT(arc_loaned_bytes == 0);
+}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate this, there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself serves as a cushion ahead of ARC eviction. The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static. Instead of searching from tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_write_boost extra write bytes during device warmup
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ *
+ * There are three key functions that control how the L2ARC warms up:
+ *
+ * l2arc_write_eligible() check if a buffer is eligible to cache
+ * l2arc_write_size() calculate how much to write
+ * l2arc_write_interval() calculate sleep delay between writes
+ *
+ * These three functions determine what to write, how much, and how quickly
+ * to send writes.
+ */
+
+static boolean_t
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
+{
+ /*
+ * A buffer is *not* eligible for the L2ARC if it:
+ * 1. belongs to a different spa.
+ * 2. is already cached on the L2ARC.
+ * 3. has an I/O in progress (it may be an incomplete read).
+ * 4. is flagged not eligible (zfs property).
+ */
+ if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
+ HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static uint64_t
+l2arc_write_size(l2arc_dev_t *dev)
+{
+ uint64_t size;
+
+ size = dev->l2ad_write;
+
+ if (arc_warm == B_FALSE)
+ size += dev->l2ad_boost;
+
+ return (size);
+}
+
+static clock_t
+l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
+{
+ clock_t interval, next, now;
+
+ /*
+ * If the ARC lists are busy, increase our write rate; if the
+ * lists are stale, idle back. This is achieved by checking
+ * how much we previously wrote - if it was more than half of
+ * what we wanted, schedule the next write much sooner.
+ */
+ if (l2arc_feed_again && wrote > (wanted / 2))
+ interval = (hz * l2arc_feed_min_ms) / 1000;
+ else
+ interval = hz * l2arc_feed_secs;
+
+ now = ddi_get_lbolt();
+ next = MAX(now, MIN(now + interval, began + interval));
+
+ return (next);
+}
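+
+/*
+ * Example of the feed pacing above, assuming l2arc_feed_again is enabled
+ * and, purely for illustration, l2arc_feed_secs = 1 and
+ * l2arc_feed_min_ms = 200 (their definitions live elsewhere in this
+ * file): if the last pass wanted 8 MB and wrote 6 MB (more than half),
+ * the next wakeup lands roughly 200 ms out; if it wrote only 2 MB, the
+ * thread idles back to the full one-second interval.
+ */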
+
+static void
+l2arc_hdr_stat_add(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+}
+
+static void
+l2arc_hdr_stat_remove(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+ ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
+}
+
+/*
+ * Cycle through L2ARC devices. This is how L2ARC load balances.
+ * If a device is returned, this also returns holding the spa config lock.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+ l2arc_dev_t *first, *next = NULL;
+
+ /*
+ * Lock out the removal of spas (spa_namespace_lock), then removal
+ * of cache devices (l2arc_dev_mtx). Once a device has been selected,
+ * both locks will be dropped and a spa config lock held instead.
+ */
+ mutex_enter(&spa_namespace_lock);
+ mutex_enter(&l2arc_dev_mtx);
+
+ /* if there are no vdevs, there is nothing to do */
+ if (l2arc_ndev == 0)
+ goto out;
+
+ first = NULL;
+ next = l2arc_dev_last;
+ do {
+ /* loop around the list looking for a non-faulted vdev */
+ if (next == NULL) {
+ next = list_head(l2arc_dev_list);
+ } else {
+ next = list_next(l2arc_dev_list, next);
+ if (next == NULL)
+ next = list_head(l2arc_dev_list);
+ }
+
+ /* if we have come back to the start, bail out */
+ if (first == NULL)
+ first = next;
+ else if (next == first)
+ break;
+
+ } while (vdev_is_dead(next->l2ad_vdev));
+
+ /* if we were unable to find any usable vdevs, return NULL */
+ if (vdev_is_dead(next->l2ad_vdev))
+ next = NULL;
+
+ l2arc_dev_last = next;
+
+out:
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Grab the config lock to prevent the 'next' device from being
+ * removed while we are writing to it.
+ */
+ if (next != NULL)
+ spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
+ mutex_exit(&spa_namespace_lock);
+
+ return (next);
+}
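The rotor above, reduced to a plain array for illustration (the ex_* names are invented for this sketch; locking and the spa config handshake are omitted): start one slot past the device used last, wrap around, and return the first non-faulted entry, or -1 when none is usable.

#include <stddef.h>

struct ex_dev {
	int	faulted;
};

static int
ex_pick_next(const struct ex_dev *devs, int ndev, int last)
{
	if (ndev == 0)
		return (-1);			/* nothing to feed */
	for (int i = 1; i <= ndev; i++) {
		int idx = (last + i) % ndev;
		if (!devs[idx].faulted)
			return (idx);
	}
	return (-1);				/* every device is faulted */
}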
+
+/*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write()
+{
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
+
+ mutex_enter(&l2arc_free_on_write_mtx);
+ buflist = l2arc_free_on_write;
+
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
+ ASSERT(df->l2df_data != NULL);
+ ASSERT(df->l2df_func != NULL);
+ df->l2df_func(df->l2df_data, df->l2df_size);
+ list_remove(buflist, df);
+ kmem_free(df, sizeof (l2arc_data_free_t));
+ }
+
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_dev_t *dev;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *ab, *ab_prev;
+ l2arc_buf_hdr_t *abl2;
+ kmutex_t *hash_lock;
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ dev = cb->l2wcb_dev;
+ ASSERT(dev != NULL);
+ head = cb->l2wcb_head;
+ ASSERT(head != NULL);
+ buflist = dev->l2ad_buflist;
+ ASSERT(buflist != NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+ for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * This buffer misses out. It may be in a stage
+ * of eviction. Its ARC_L2_WRITING flag will be
+ * left set, denying reads to this buffer.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
+ continue;
+ }
+
+ if (zio->io_error != 0) {
+ /*
+ * Error - drop L2ARC entry.
+ */
+ list_remove(buflist, ab);
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+
+ /*
+ * Allow ARC to begin reads to this L2ARC entry.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+
+ mutex_exit(hash_lock);
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ kmem_cache_free(hdr_cache, head);
+ mutex_exit(&l2arc_buflist_mtx);
+
+ l2arc_do_free_on_write();
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ int equal;
+
+ ASSERT(zio->io_vd != NULL);
+ ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
+
+ spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ buf = cb->l2rcb_buf;
+ ASSERT(buf != NULL);
+
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ equal = arc_cksum_equal(buf);
+ if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = buf;
+ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
+ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+ arc_read_done(zio);
+ } else {
+ mutex_exit(hash_lock);
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ } else {
+ zio->io_error = EIO;
+ }
+ if (!equal)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ /*
+ * If there's no waiter, issue an async i/o to the primary
+ * storage now. If there *is* a waiter, the caller must
+ * issue the i/o in a context where it's OK to block.
+ */
+ if (zio->io_waiter == NULL) {
+ zio_t *pio = zio_unique_parent(zio);
+
+ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
+ buf->b_data, zio->io_size, arc_read_done, buf,
+ zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+ }
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache. This is used within loops (0..3) to cycle through lists in the
+ * desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list, and also returns
+ * the lock pointer.
+ */
+static list_t *
+l2arc_list_locked(int list_num, kmutex_t **lock)
+{
+ list_t *list;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 1:
+ list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ case 2:
+ list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 3:
+ list = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ }
+
+ ASSERT(!(MUTEX_HELD(*lock)));
+ mutex_enter(*lock);
+ return (list);
+}
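For quick reference, the order encoded by the switch above, written out as a table (illustration only; the ex_ name is invented):

static const char *ex_l2arc_search_order[4] = {
	"MFU metadata",		/* list 0 */
	"MRU metadata",		/* list 1 */
	"MFU data",		/* list 2 */
	"MRU data",		/* list 3 */
};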
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, or it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ l2arc_buf_hdr_t *abl2;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+
+ buflist = dev->l2ad_buflist;
+
+ if (buflist == NULL)
+ return;
+
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ return;
+ }
+
+ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+ /*
+ * When nearing the end of the device, evict to the end
+ * before the device write hand jumps to the start.
+ */
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+top:
+ mutex_enter(&l2arc_buflist_mtx);
+ for (ab = list_tail(buflist); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&l2arc_buflist_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ if (HDR_L2_WRITE_HEAD(ab)) {
+ /*
+ * We hit a write head node. Leave it for
+ * l2arc_write_done().
+ */
+ list_remove(buflist, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (!all && ab->b_l2hdr != NULL &&
+ (ab->b_l2hdr->b_daddr > taddr ||
+ ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (HDR_FREE_IN_PROGRESS(ab)) {
+ /*
+ * Already on the path to destruction.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_state == arc_l2c_only) {
+ ASSERT(!HDR_L2_READING(ab));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_size.
+ */
+ arc_change_state(arc_anon, ab, hash_lock);
+ arc_hdr_destroy(ab);
+ } else {
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(ab)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ ab->b_flags |= ARC_L2_EVICTED;
+ }
+
+ /*
+ * Tell ARC this no longer exists in L2ARC.
+ */
+ if (ab->b_l2hdr != NULL) {
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+ list_remove(buflist, ab);
+
+ /*
+ * This may have been leftover after a
+ * failed write.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
+ dev->l2ad_evict = taddr;
+}
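The target-address arithmetic used above, restated in isolation (ex_evict_target is a name invented for this sketch): near the end of the device the eviction runs all the way to the end, because the write hand will wrap back to the start on its next pass.

#include <stdint.h>

static uint64_t
ex_evict_target(uint64_t hand, uint64_t end, uint64_t distance)
{
	if (hand >= end - 2 * distance)
		return (end);		/* about to wrap: clear to the end */
	return (hand + distance);	/* otherwise clear 'distance' ahead */
}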
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ */
+static uint64_t
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+{
+ arc_buf_hdr_t *ab, *ab_prev, *head;
+ l2arc_buf_hdr_t *hdrl2;
+ list_t *list;
+ uint64_t passed_sz, write_sz, buf_sz, headroom;
+ void *buf_data;
+ kmutex_t *hash_lock, *list_lock;
+ boolean_t have_lock, full;
+ l2arc_write_callback_t *cb;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_guid(spa);
+
+ ASSERT(dev->l2ad_vdev != NULL);
+
+ pio = NULL;
+ write_sz = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ head->b_flags |= ARC_L2_WRITE_HEAD;
+
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ for (int try = 0; try <= 3; try++) {
+ list = l2arc_list_locked(try, &list_lock);
+ passed_sz = 0;
+
+ /*
+ * L2ARC fast warmup.
+ *
+ * Until the ARC is warm and starts to evict, read from the
+ * head of the ARC lists rather than the tail.
+ */
+ headroom = target_sz * l2arc_headroom;
+ if (arc_warm == B_FALSE)
+ ab = list_head(list);
+ else
+ ab = list_tail(list);
+
+ for (; ab; ab = ab_prev) {
+ if (arc_warm == B_FALSE)
+ ab_prev = list_next(list, ab);
+ else
+ ab_prev = list_prev(list, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (!have_lock && !mutex_tryenter(hash_lock)) {
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += ab->b_size;
+ if (passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (!l2arc_write_eligible(guid, ab)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if ((write_sz + ab->b_size) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ list_insert_head(dev->l2ad_buflist, head);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ /*
+ * Create and add a new L2ARC header.
+ */
+ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ hdrl2->b_dev = dev;
+ hdrl2->b_daddr = dev->l2ad_hand;
+
+ ab->b_flags |= ARC_L2_WRITING;
+ ab->b_l2hdr = hdrl2;
+ list_insert_head(dev->l2ad_buflist, ab);
+ buf_data = ab->b_buf->b_data;
+ buf_sz = ab->b_size;
+
+ /*
+ * Compute and store the buffer cksum before
+ * writing. On debug the cksum is verified first.
+ */
+ arc_cksum_verify(ab->b_buf);
+ arc_cksum_compute(ab->b_buf, B_TRUE);
+
+ mutex_exit(hash_lock);
+
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ /*
+ * Keep the clock hand suitably device-aligned.
+ */
+ buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+
+ write_sz += buf_sz;
+ dev->l2ad_hand += buf_sz;
+ }
+
+ mutex_exit(list_lock);
+
+ if (full == B_TRUE)
+ break;
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ if (pio == NULL) {
+ ASSERT3U(write_sz, ==, 0);
+ kmem_cache_free(hdr_cache, head);
+ return (0);
+ }
+
+ ASSERT3U(write_sz, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
+ ARCSTAT_INCR(arcstat_l2_size, write_sz);
+ vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
+
+ /*
+ * Bump device hand to the device start if it is approaching the end.
+ * l2arc_evict() will already have evicted ahead for this case.
+ */
+ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
+ vdev_space_update(dev->l2ad_vdev,
+ dev->l2ad_end - dev->l2ad_hand, 0, 0);
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ }
+
+ dev->l2ad_writing = B_TRUE;
+ (void) zio_wait(pio);
+ dev->l2ad_writing = B_FALSE;
+
+ return (write_sz);
+}
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ */
+static void
+l2arc_feed_thread(void)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ uint64_t size, wrote;
+ clock_t begin, next = ddi_get_lbolt();
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ while (l2arc_thread_exit == 0) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
+ next);
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+ next = ddi_get_lbolt() + hz;
+
+ /*
+ * Quick check for L2ARC devices.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+ begin = ddi_get_lbolt();
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa. This
+ * will return NULL if there are now no l2arc devices or if
+ * they are all faulted.
+ *
+ * If a device is returned, its spa's config lock is also
+ * held to prevent device removal. l2arc_dev_get_next()
+ * will grab and release l2arc_dev_mtx.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL)
+ continue;
+
+ spa = dev->l2ad_spa;
+ ASSERT(spa != NULL);
+
+ /*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ size = l2arc_write_size(dev);
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, size, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ wrote = l2arc_write_buffers(spa, dev, size);
+
+ /*
+ * Calculate interval between writes.
+ */
+ next = l2arc_write_interval(begin, size, wrote);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ }
+
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
+boolean_t
+l2arc_vdev_present(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
+
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev != NULL;
+ dev = list_next(l2arc_dev_list, dev)) {
+ if (dev->l2ad_vdev == vd)
+ break;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ return (dev != NULL);
+}
+
+/*
+ * Add a vdev for use by the L2ARC. By this point the spa has already
+ * validated the vdev and opened it.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+{
+ l2arc_dev_t *adddev;
+
+ ASSERT(!l2arc_vdev_present(vd));
+
+ /*
+ * Create a new l2arc device entry.
+ */
+ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev->l2ad_spa = spa;
+ adddev->l2ad_vdev = vd;
+ adddev->l2ad_write = l2arc_write_max;
+ adddev->l2ad_boost = l2arc_write_boost;
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
+ adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
+ adddev->l2ad_first = B_TRUE;
+ adddev->l2ad_writing = B_FALSE;
+ ASSERT3U(adddev->l2ad_write, >, 0);
+
+ /*
+ * This is a list of all ARC buffers that are still valid on the
+ * device.
+ */
+ adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+ list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2node));
+
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+
+ /*
+ * Add device to global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_insert_head(l2arc_dev_list, adddev);
+ atomic_inc_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+ l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+
+ /*
+ * Find the device by vdev
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
+ nextdev = list_next(l2arc_dev_list, dev);
+ if (vd == dev->l2ad_vdev) {
+ remdev = dev;
+ break;
+ }
+ }
+ ASSERT(remdev != NULL);
+
+ /*
+ * Remove device from global list
+ */
+ list_remove(l2arc_dev_list, remdev);
+ l2arc_dev_last = NULL; /* may have been invalidated */
+ atomic_dec_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Clear all buflists and ARC references. L2ARC device flush.
+ */
+ l2arc_evict(remdev, 0, B_TRUE);
+ list_destroy(remdev->l2ad_buflist);
+ kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+ kmem_free(remdev, sizeof (l2arc_dev_t));
+}
+
+void
+l2arc_init(void)
+{
+ l2arc_thread_exit = 0;
+ l2arc_ndev = 0;
+ l2arc_writes_sent = 0;
+ l2arc_writes_done = 0;
+
+ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ l2arc_dev_list = &L2ARC_dev_list;
+ l2arc_free_on_write = &L2ARC_free_on_write;
+ list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+ offsetof(l2arc_dev_t, l2ad_node));
+ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+ offsetof(l2arc_data_free_t, l2df_list_node));
+}
+
+void
+l2arc_fini(void)
+{
+ /*
+ * This is called from dmu_fini(), which is called from spa_fini().
+ * Because of this, we can assume that all l2arc devices have
+ * already been removed when the pools themselves were removed.
+ */
+
+ l2arc_do_free_on_write();
+
+ mutex_destroy(&l2arc_feed_thr_lock);
+ cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_dev_mtx);
+ mutex_destroy(&l2arc_buflist_mtx);
+ mutex_destroy(&l2arc_free_on_write_mtx);
+
+ list_destroy(l2arc_dev_list);
+ list_destroy(l2arc_free_on_write);
+}
+
+void
+l2arc_start(void)
+{
+ if (!(spa_mode_global & FWRITE))
+ return;
+
+ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+}
+
+void
+l2arc_stop(void)
+{
+ if (!(spa_mode_global & FWRITE))
+ return;
+
+ mutex_enter(&l2arc_feed_thr_lock);
+ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
+ l2arc_thread_exit = 1;
+ while (l2arc_thread_exit != 0)
+ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+ mutex_exit(&l2arc_feed_thr_lock);
+}
diff --git a/uts/common/fs/zfs/bplist.c b/uts/common/fs/zfs/bplist.c
new file mode 100644
index 000000000000..066ccc6b1e05
--- /dev/null
+++ b/uts/common/fs/zfs/bplist.c
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+
+void
+bplist_create(bplist_t *bpl)
+{
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+ offsetof(bplist_entry_t, bpe_node));
+}
+
+void
+bplist_destroy(bplist_t *bpl)
+{
+ list_destroy(&bpl->bpl_list);
+ mutex_destroy(&bpl->bpl_lock);
+}
+
+void
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
+{
+ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
+
+ mutex_enter(&bpl->bpl_lock);
+ bpe->bpe_blk = *bp;
+ list_insert_tail(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
+{
+ bplist_entry_t *bpe;
+
+ mutex_enter(&bpl->bpl_lock);
+ while (bpe = list_head(&bpl->bpl_list)) {
+ list_remove(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+ func(arg, &bpe->bpe_blk, tx);
+ kmem_free(bpe, sizeof (*bpe));
+ mutex_enter(&bpl->bpl_lock);
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
diff --git a/uts/common/fs/zfs/bpobj.c b/uts/common/fs/zfs/bpobj.c
new file mode 100644
index 000000000000..72be31235607
--- /dev/null
+++ b/uts/common/fs/zfs/bpobj.c
@@ -0,0 +1,495 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+ size = BPOBJ_SIZE_V0;
+ else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ size = BPOBJ_SIZE_V1;
+ else
+ size = sizeof (bpobj_phys_t);
+
+ return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+ DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ int64_t i;
+ bpobj_t bpo;
+ dmu_object_info_t doi;
+ int epb;
+ dmu_buf_t *dbuf = NULL;
+
+ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+ mutex_enter(&bpo.bpo_lock);
+
+ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ VERIFY3U(0, ==, dmu_buf_hold(os,
+ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ bpobj_free(os, objarray[blkoff], tx);
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+ mutex_exit(&bpo.bpo_lock);
+ bpobj_close(&bpo);
+
+ VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(os, object, &doi);
+ if (err)
+ return (err);
+
+ bzero(bpo, sizeof (*bpo));
+ mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ASSERT(bpo->bpo_dbuf == NULL);
+ ASSERT(bpo->bpo_phys == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+ if (err)
+ return (err);
+
+ bpo->bpo_os = os;
+ bpo->bpo_object = object;
+ bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+ return (0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+ /* Lame workaround for closing a bpobj that was never opened. */
+ if (bpo->bpo_object == 0)
+ return;
+
+ dmu_buf_rele(bpo->bpo_dbuf, bpo);
+ if (bpo->bpo_cached_dbuf != NULL)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ bpo->bpo_dbuf = NULL;
+ bpo->bpo_phys = NULL;
+ bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
+
+ mutex_destroy(&bpo->bpo_lock);
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+ boolean_t free)
+{
+ dmu_object_info_t doi;
+ int epb;
+ int64_t i;
+ int err = 0;
+ dmu_buf_t *dbuf = NULL;
+
+ mutex_enter(&bpo->bpo_lock);
+
+ if (free)
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+ blkptr_t *bparray;
+ blkptr_t *bp;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (blkptr_t);
+ blkoff = P2PHASE(i, bpo->bpo_epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+ FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ bparray = dbuf->db_data;
+ bp = &bparray[blkoff];
+ err = func(arg, bp, tx);
+ if (err)
+ break;
+ if (free) {
+ bpo->bpo_phys->bpo_bytes -=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ bpo->bpo_phys->bpo_num_blkptrs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ i++;
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+ i * sizeof (blkptr_t), -1ULL, tx));
+ }
+ if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ ASSERT(bpo->bpo_havecomp);
+ err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+ if (err) {
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+ }
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+ bpobj_t sublist;
+ uint64_t used_before, comp_before, uncomp_before;
+ uint64_t used_after, comp_after, uncomp_after;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+ if (err)
+ break;
+ if (free) {
+ err = bpobj_space(&sublist,
+ &used_before, &comp_before, &uncomp_before);
+ if (err)
+ break;
+ }
+ err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+ if (free) {
+ VERIFY3U(0, ==, bpobj_space(&sublist,
+ &used_after, &comp_after, &uncomp_after));
+ bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+ bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+ bpo->bpo_phys->bpo_uncomp -=
+ uncomp_before - uncomp_after;
+ }
+
+ bpobj_close(&sublist);
+ if (err)
+ break;
+ if (free) {
+ err = dmu_object_free(bpo->bpo_os,
+ objarray[blkoff], tx);
+ if (err)
+ break;
+ bpo->bpo_phys->bpo_num_subobjs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+ }
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ if (free) {
+ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ (i + 1) * sizeof (uint64_t), -1ULL, tx));
+ }
+
+out:
+ /* If there are no entries, there should be no bytes. */
+ ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
+ (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
+ bpo->bpo_phys->bpo_bytes == 0);
+
+ mutex_exit(&bpo->bpo_lock);
+ return (err);
+}
+
+/*
+ * Iterate and remove the entries. If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries. If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
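A hypothetical caller, to illustrate the iteration contract: count block pointers without freeing anything. The callback signature mirrors space_range_cb() later in this file, and passing a NULL tx follows bpobj_space_range(); this fragment assumes the surrounding ZFS headers and is not standalone.

/* ARGSUSED */
static int
ex_count_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	(*(uint64_t *)arg)++;
	return (0);
}

	/* ... in some caller that holds a bpobj_t *bpo ... */
	uint64_t count = 0;
	(void) bpobj_iterate_nofree(bpo, ex_count_cb, &count, NULL);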
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+ bpobj_t subbpo;
+ uint64_t used, comp, uncomp, subsubobjs;
+
+ ASSERT(bpo->bpo_havesubobj);
+ ASSERT(bpo->bpo_havecomp);
+
+ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+
+ if (used == 0) {
+ /* No point in having an empty subobj. */
+ bpobj_close(&subbpo);
+ bpobj_free(bpo->bpo_os, subobj, tx);
+ return;
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ }
+
+ mutex_enter(&bpo->bpo_lock);
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+
+ /*
+ * If subobj has only one block of subobjs, then move subobj's
+ * subobjs to bpo's subobj list directly. This reduces
+ * recursion in bpobj_iterate due to nested subobjs.
+ */
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ if (subsubobjs != 0) {
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset == doi.doi_data_block_size) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
+ subsubobjs, tx));
+ }
+ }
+ bpo->bpo_phys->bpo_bytes += used;
+ bpo->bpo_phys->bpo_comp += comp;
+ bpo->bpo_phys->bpo_uncomp += uncomp;
+ mutex_exit(&bpo->bpo_lock);
+
+ bpobj_close(&subbpo);
+}
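The flattening idea in miniature (the ex_* types are invented; no I/O, and it assumes the combined list fits in the parent array): splice the child's object list directly into the parent so later iteration stays one level deep.

#include <stdint.h>
#include <string.h>

#define	EX_MAXOBJS	64

struct ex_objlist {
	uint64_t	objs[EX_MAXOBJS];
	int		n;
};

static void
ex_splice_child(struct ex_objlist *parent, struct ex_objlist *child)
{
	/* assumes parent->n + child->n <= EX_MAXOBJS */
	(void) memcpy(&parent->objs[parent->n], child->objs,
	    (size_t)child->n * sizeof (uint64_t));
	parent->n += child->n;
	child->n = 0;			/* the child now contributes nothing */
}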
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ blkptr_t stored_bp = *bp;
+ uint64_t offset;
+ int blkoff;
+ blkptr_t *bparray;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /* We never need the fill count. */
+ stored_bp.blk_fill = 0;
+
+ /* The bpobj will compress better if we can leave off the checksum */
+ if (!BP_GET_DEDUP(bp))
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+
+ mutex_enter(&bpo->bpo_lock);
+
+ offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+ blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+ if (bpo->bpo_cached_dbuf == NULL ||
+ offset < bpo->bpo_cached_dbuf->db_offset ||
+ offset >= bpo->bpo_cached_dbuf->db_offset +
+ bpo->bpo_cached_dbuf->db_size) {
+ if (bpo->bpo_cached_dbuf)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, bpo, &bpo->bpo_cached_dbuf, 0));
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+ bparray = bpo->bpo_cached_dbuf->db_data;
+ bparray[blkoff] = stored_bp;
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ bpo->bpo_phys->bpo_num_blkptrs++;
+ bpo->bpo_phys->bpo_bytes +=
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+ }
+ mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+ spa_t *spa;
+ uint64_t mintxg;
+ uint64_t maxtxg;
+ uint64_t used;
+ uint64_t comp;
+ uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct space_range_arg *sra = arg;
+
+ if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+ sra->used += bp_get_dsize_sync(sra->spa, bp);
+ sra->comp += BP_GET_PSIZE(bp);
+ sra->uncomp += BP_GET_UCSIZE(bp);
+ }
+ return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ mutex_enter(&bpo->bpo_lock);
+
+ *usedp = bpo->bpo_phys->bpo_bytes;
+ if (bpo->bpo_havecomp) {
+ *compp = bpo->bpo_phys->bpo_comp;
+ *uncompp = bpo->bpo_phys->bpo_uncomp;
+ mutex_exit(&bpo->bpo_lock);
+ return (0);
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ return (bpobj_space_range(bpo, 0, UINT64_MAX,
+ usedp, compp, uncompp));
+ }
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ struct space_range_arg sra = { 0 };
+ int err;
+
+ /*
+ * As an optimization, if they want the whole txg range, just
+ * get bpo_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+ return (bpobj_space(bpo, usedp, compp, uncompp));
+
+ sra.spa = dmu_objset_spa(bpo->bpo_os);
+ sra.mintxg = mintxg;
+ sra.maxtxg = maxtxg;
+
+ err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+ *usedp = sra.used;
+ *compp = sra.comp;
+ *uncompp = sra.uncomp;
+ return (err);
+}
diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c
new file mode 100644
index 000000000000..9c4e0296db2b
--- /dev/null
+++ b/uts/common/fs/zfs/dbuf.c
@@ -0,0 +1,2707 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_cache;
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ refcount_create(&db->db_holds);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ cv_destroy(&db->db_changed);
+ refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+ uintptr_t osv = (uintptr_t)os;
+ uint64_t crc = -1ULL;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+
+ crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
+
+ return (crc);
+}
+
+#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
+
+#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
+ ((dbuf)->db.db_object == (obj) && \
+ (dbuf)->db_objset == (os) && \
+ (dbuf)->db_level == (level) && \
+ (dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_t *os = dn->dn_objset;
+ uint64_t obj = dn->dn_object;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *db;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+ if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (db);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_t *os = db->db_objset;
+ uint64_t obj = db->db.db_object;
+ int level = db->db_level;
+ uint64_t blkid = db->db_blkid;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *dbf;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+ mutex_enter(&dbf->db_mtx);
+ if (dbf->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (dbf);
+ }
+ mutex_exit(&dbf->db_mtx);
+ }
+ }
+
+ mutex_enter(&db->db_mtx);
+ db->db_hash_next = h->hash_table[idx];
+ h->hash_table[idx] = db;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_add_64(&dbuf_hash_count, 1);
+
+ return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table. This operation will
+ * fail if there are any existing holds on the db.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *dbf, **dbp;
+
+ /*
+ * We mustn't hold db_mtx to maintain lock ordering:
+ * DBUF_HASH_MUTEX > db_mtx.
+ */
+ ASSERT(refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_state == DB_EVICTING);
+ ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ dbp = &h->hash_table[idx];
+ while ((dbf = *dbp) != db) {
+ dbp = &dbf->db_hash_next;
+ ASSERT(dbf != NULL);
+ }
+ *dbp = db->db_hash_next;
+ db->db_hash_next = NULL;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static arc_evict_func_t dbuf_do_evict;
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level != 0 || db->db_evict_func == NULL)
+ return;
+
+ if (db->db_user_data_ptr_ptr)
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ db->db_evict_func(&db->db, db->db_user_ptr);
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+}
+
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ if (db->db_level > 0) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_data_pending == NULL);
+
+ dbuf_clear(db);
+ dbuf_destroy(db);
+}
+
+void
+dbuf_init(void)
+{
+ uint64_t hsize = 1ULL << 16;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 4K block size. The table will take up
+ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
+ */
+ while (hsize * 4096 < physmem * PAGESIZE)
+ hsize <<= 1;
+
+retry:
+ h->hash_table_mask = hsize - 1;
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+ sizeof (dmu_buf_impl_t),
+ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+}
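Plugging an example memory size into the sizing loop above: with an assumed 16 GiB of physical memory and 8-byte pointers, the table ends up with 4M buckets and 32 MB of bucket pointers, matching the 2MB/GB figure in the comment. A standalone check of just that arithmetic:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physbytes = 16ULL << 30;	/* assumed 16 GiB of RAM */
	uint64_t hsize = 1ULL << 16;

	/* one bucket per average 4K block of physical memory */
	while (hsize * 4096 < physbytes)
		hsize <<= 1;

	(void) printf("%llu buckets, %llu bytes of bucket pointers\n",
	    (unsigned long long)hsize,
	    (unsigned long long)(hsize * sizeof (void *)));
	return (0);
}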
+
+void
+dbuf_fini(void)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_destroy(&h->hash_mutexes[i]);
+ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+ kmem_cache_destroy(dbuf_cache);
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dbuf_dirty_record_t *dr;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+ return;
+
+ ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn == NULL) {
+ ASSERT(db->db_parent == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ } else {
+ ASSERT3U(db->db.db_object, ==, dn->dn_object);
+ ASSERT3P(db->db_objset, ==, dn->dn_objset);
+ ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !list_is_empty(&dn->dn_dbufs));
+ }
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+ } else if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, 0);
+ } else {
+ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+ }
+
+ for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
+ for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
+ /*
+ * We can't assert that db_size matches dn_datablksz because it
+ * can be momentarily different when another thread is doing
+ * dnode_set_blksz().
+ */
+ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dr = db->db_data_pending;
+ /*
+ * It should only be modified in syncing context, so
+ * make sure we only have one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+ }
+
+ /* verify db->db_blkptr */
+ if (db->db_blkptr) {
+ if (db->db_parent == dn->dn_dbuf) {
+ /* db is pointed to by the dnode */
+ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
+ ASSERT(db->db_parent == NULL);
+ else
+ ASSERT(db->db_parent != NULL);
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ } else {
+ /* db is pointed to by an indirect block */
+ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+ ASSERT3U(db->db_parent->db.db_object, ==,
+ db->db.db_object);
+ /*
+ * dnode_grow_indblksz() can make this fail if we don't
+ * have the struct_rwlock. XXX indblksz no longer
+ * grows. safe to do this now?
+ */
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ ASSERT3P(db->db_blkptr, ==,
+ ((blkptr_t *)db->db_parent->db.db_data +
+ db->db_blkid % epb));
+ }
+ }
+ }
+ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ (db->db_buf == NULL || db->db_buf->b_data) &&
+ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_state != DB_FILL && !dn->dn_free_txg) {
+ /*
+ * If the blkptr isn't set but they have nonzero data,
+ * it had better be dirty, otherwise we'll lose that
+ * data when we evict this buffer.
+ */
+ if (db->db_dirtycnt == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
+
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ }
+ }
+ DB_DNODE_EXIT(db);
+}
+#endif
+
+static void
+dbuf_update_data(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ }
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
+ db->db_buf = buf;
+ if (buf != NULL) {
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+ if (!arc_released(buf))
+ arc_set_callback(buf, dbuf_do_evict, db);
+ dbuf_update_data(db);
+ } else {
+ dbuf_evict_user(db);
+ db->db.db_data = NULL;
+ if (db->db_state != DB_NOFILL)
+ db->db_state = DB_UNCACHED;
+ }
+}
+
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ spa_t *spa;
+
+ mutex_exit(&db->db_mtx);
+ DB_GET_SPA(&spa, db);
+ abuf = arc_loan_buf(spa, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ dbuf_set_data(db, NULL);
+ mutex_exit(&db->db_mtx);
+ }
+ return (abuf);
+}
+
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+ if (dn->dn_datablkshift) {
+ return (offset >> dn->dn_datablkshift);
+ } else {
+ ASSERT3U(offset, <, dn->dn_datablksz);
+ return (0);
+ }
+}
+
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db_state, ==, DB_READ);
+ /*
+ * All reads are synchronous, so we must have a hold on the dbuf
+ */
+ ASSERT(refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* we were freed in flight; disregard any error */
+ arc_release(buf, db);
+ bzero(buf->b_data, db->db.db_size);
+ arc_buf_freeze(buf);
+ db->db_freed_in_flight = FALSE;
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else if (zio == NULL || zio->io_error == 0) {
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ db->db_state = DB_UNCACHED;
+ }
+ cv_broadcast(&db->db_changed);
+ dbuf_rele_and_unlock(db, NULL);
+}
+
+static void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+{
+ dnode_t *dn;
+ spa_t *spa;
+ zbookmark_t zb;
+ uint32_t aflags = ARC_NOWAIT;
+ arc_buf_t *pbuf;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ /* We need the struct_rwlock to prevent db_blkptr from changing. */
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
+
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
+ db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ if (bonuslen < DN_MAX_BONUSLEN)
+ bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ if (bonuslen)
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ DB_DNODE_EXIT(db);
+ dbuf_update_data(db);
+ db->db_state = DB_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ /*
+ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+ * processes the delete record and clears the bp while we are waiting
+ * for the dn_mtx (resulting in a "no" from block_freed).
+ */
+ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
+ (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
+ BP_IS_HOLE(db->db_blkptr)))) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ DB_DNODE_EXIT(db);
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_state = DB_CACHED;
+ *flags |= DB_RF_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ spa = dn->dn_objset->os_spa;
+ DB_DNODE_EXIT(db);
+
+ db->db_state = DB_READ;
+ mutex_exit(&db->db_mtx);
+
+ if (DBUF_IS_L2CACHEABLE(db))
+ aflags |= ARC_L2CACHE;
+
+ SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ dbuf_add_ref(db, NULL);
+ /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
+
+ if (db->db_parent)
+ pbuf = db->db_parent->db_buf;
+ else
+ pbuf = db->db_objset->os_phys_buf;
+
+ (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+ (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+ if (aflags & ARC_CACHED)
+ *flags |= DB_RF_CACHED;
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ int err = 0;
+ int havepzio = (zio != NULL);
+ int prefetch;
+ dnode_t *dn;
+
+ /*
+ * We don't have to hold the mutex to check db_state because it
+ * can't be freed while we have a hold on the buffer.
+ */
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ if (db->db_state == DB_NOFILL)
+ return (EIO);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
+ DBUF_IS_CACHEABLE(db);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+ } else if (db->db_state == DB_UNCACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+
+ if (zio == NULL)
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ dbuf_read_impl(db, zio, &flags);
+
+ /* dbuf_read_impl has dropped db_mtx for us */
+
+ if (prefetch)
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
+ db->db.db_size, flags & DB_RF_CACHED);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+
+ if (!havepzio)
+ err = zio_wait(zio);
+ } else {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+
+ mutex_enter(&db->db_mtx);
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ }
+ mutex_exit(&db->db_mtx);
+ }
+
+ ASSERT(err || havepzio || db->db_state == DB_CACHED);
+ return (err);
+}
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa;
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ DB_GET_SPA(&spa, db);
+ dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
+ db->db_state = DB_FILL;
+ } else if (db->db_state == DB_NOFILL) {
+ dbuf_set_data(db, NULL);
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function. It makes a copy of
+ * buffers that have been modified in a previous transaction
+ * group, before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer; we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+ * and it's referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+ * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ dbuf_set_data(db, NULL);
+ }
+}
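The copy-before-modify idea above, stripped of the ARC and dnode details (the ex_* names are invented, and plain malloc stands in for the zio/arc allocators): if an earlier dirty record still points at the current data, detach it onto a private copy before the current txg rewrites the buffer.

#include <stdlib.h>
#include <string.h>

struct ex_buf {
	void	*data;
	size_t	size;
};

static int
ex_fix_old_data(struct ex_buf *old_dirty, const struct ex_buf *cur)
{
	void *copy;

	if (old_dirty->data != cur->data)
		return (0);		/* already has its own copy */

	copy = malloc(cur->size);
	if (copy == NULL)
		return (-1);
	(void) memcpy(copy, cur->data, cur->size);
	old_dirty->data = copy;		/* old txg keeps the old contents */
	return (0);
}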
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
+ uint64_t txg = dr->dr_txg;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
+
+ ASSERT(db->db_data_pending != dr);
+
+ /* free this block */
+ if (!BP_IS_HOLE(bp)) {
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ zio_free(spa, txg, bp);
+ }
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
+}
+
+/*
+ * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.  Also, if we happen across any level-1 dbufs in the
+ * range that have not already been marked dirty, mark them dirty so
+ * they stay in memory.
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db, *db_next;
+ uint64_t txg = tx->tx_txg;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t first_l1 = start >> epbs;
+ uint64_t last_l1 = end >> epbs;
+
+ if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
+ end = dn->dn_maxblkid;
+ last_l1 = end >> epbs;
+ }
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ db_next = list_next(&dn->dn_dbufs, db);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ if (db->db_level == 1 &&
+ db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_last_dirty &&
+ db->db_last_dirty->dr_txg < txg) {
+ dbuf_add_ref(db, FTAG);
+ mutex_exit(&db->db_mtx);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+ }
+
+ if (db->db_level != 0)
+ continue;
+ dprintf_dbuf(db, "found buf %s\n", "");
+ if (db->db_blkid < start || db->db_blkid > end)
+ continue;
+
+ /* found a level 0 buffer in the range */
+ if (dbuf_undirty(db, tx))
+ continue;
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL ||
+ db->db_state == DB_EVICTING) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
+ db->db_freed_in_flight = TRUE;
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_clear(db);
+ continue;
+ }
+ /* The dbuf is referenced */
+
+ if (db->db_last_dirty != NULL) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ if (dr->dr_txg == txg) {
+ /*
+ * This buffer is "in-use", re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
+ */
+ if (db->db_blkid != DMU_SPILL_BLKID &&
+ db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+				 * Either uncache it (if it's not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ }
+ }
+		/* clear the contents if it's cached */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ arc_release(db->db_buf, db);
+ bzero(db->db.db_data, db->db.db_size);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+static int
+dbuf_block_freeable(dmu_buf_impl_t *db)
+{
+ dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+ uint64_t birth_txg = 0;
+
+ /*
+ * We don't need any locking to protect db_blkptr:
+ * If it's syncing, then db_last_dirty will be set
+ * so we'll ignore db_blkptr.
+ */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ if (db->db_last_dirty)
+ birth_txg = db->db_last_dirty->dr_txg;
+ else if (db->db_blkptr)
+ birth_txg = db->db_blkptr->blk_birth;
+
+ /*
+ * If we don't exist or are in a snapshot, we can't be freed.
+ * Don't pass the bp to dsl_dataset_block_freeable() since we
+ * are holding the db_mtx lock and might deadlock if we are
+ * prefetching a dedup-ed block.
+ */
+ if (birth_txg)
+ return (ds == NULL ||
+ dsl_dataset_block_freeable(ds, NULL, birth_txg));
+ else
+ return (FALSE);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+ arc_buf_t *buf, *obuf;
+ int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ /* XXX does *this* func really need the lock? */
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ /*
+ * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+ * is OK, because there can be no other references to the db
+ * when we are changing its size, so no concurrent DB_FILL can
+ * be happening.
+ */
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
+ dbuf_will_dirty(db, tx);
+
+ /* create the data buffer for the new block */
+ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
+
+ /* copy old block data to the new block */
+ obuf = db->db_buf;
+ bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
+ /* zero the remainder */
+ if (size > osize)
+ bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_set_data(db, buf);
+ VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+ db->db.db_size = size;
+
+ if (db->db_level == 0) {
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ db->db_last_dirty->dt.dl.dr_data = buf;
+ }
+ mutex_exit(&db->db_mtx);
+
+ dnode_willuse_space(dn, size-osize, tx);
+ DB_DNODE_EXIT(db);
+}
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+ objset_t *os;
+ zbookmark_t zb;
+
+ DB_GET_OBJSET(&os, db);
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(arc_released(os->os_phys_buf) ||
+ list_link_active(&os->os_dsl_dataset->ds_synced_link));
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+ zb.zb_objset = os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ (void) arc_release_bp(db->db_buf, db,
+ db->db_blkptr, os->os_spa, &zb);
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ objset_t *os;
+ dbuf_dirty_record_t **drp, *dr;
+ int drop_struct_lock = FALSE;
+ boolean_t do_free_accounting = B_FALSE;
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ DMU_TX_DIRTY_BUF(tx, db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ dn->dn_objset->os_dsl_dataset == NULL);
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
+ */
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+ db->db_state == DB_NOFILL);
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+ !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx =
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ dn->dn_have_spill = B_TRUE;
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ drp = &db->db_last_dirty;
+ ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
+ drp = &dr->dr_next;
+ if (dr && dr->dr_txg == tx->tx_txg) {
+ DB_DNODE_EXIT(db);
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(dr);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL)
+ arc_buf_thaw(db->db_buf);
+ }
+ mutex_exit(&db->db_mtx);
+ return (dr);
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_object == 0 ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos or we're initializing the os or it's a special object.
+ * However, we are allowed to dirty in syncing context provided
+ * we already dirtied it in open context. Hence we must make
+ * this assertion only if we're not already dirty.
+ */
+ os = dn->dn_objset;
+ ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ * Note: we delay "free accounting" until after we drop
+ * the db_mtx. This keeps us from grabbing other locks
+ * (and possibly deadlocking) in bp_get_dsize() while
+ * also holding the db_mtx.
+ */
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ do_free_accounting = dbuf_block_freeable(db);
+ }
+
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so
+ * that we can modify it without impacting
+ * possible other users of this cached data
+ * block. Note that indirect blocks and
+ * private objects are not released until the
+ * syncing state (since they are only modified
+ * then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
+ }
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ dr->dr_next = *drp;
+ *drp = dr;
+
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+ } else if (do_free_accounting) {
+ blkptr_t *bp = db->db_blkptr;
+ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+ bp_get_dsize(os->os_spa, bp) : db->db.db_size;
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ ddt_prefetch(os->os_spa, bp);
+ dnode_willuse_space(dn, -willfree, tx);
+ }
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
+ if (db->db_level+1 < dn->dn_nlevels) {
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ ASSERT(parent != NULL);
+ parent_held = TRUE;
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3U(db->db_level+1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /* possible race with dbuf_undirty() */
+ if (db->db_last_dirty == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
+ } else {
+ ASSERT(db->db_level+1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+}
+
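+/*
+ * Undo any dirtying of this dbuf done in the given transaction group.
+ * Returns 1 if removing the dirty record dropped the last hold and the
+ * dbuf was evicted; returns 0 if the dbuf was not dirty in this txg or
+ * could not be undirtied (e.g. because other holders may be mid-update).
+ */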
+static int
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ uint64_t txg = tx->tx_txg;
+ dbuf_dirty_record_t *dr, **drp;
+
+ ASSERT(txg != 0);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * If this buffer is not dirty, we're done.
+ */
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+ if (dr->dr_txg <= txg)
+ break;
+ if (dr == NULL || dr->dr_txg < txg) {
+ mutex_exit(&db->db_mtx);
+ return (0);
+ }
+ ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ /*
+ * If this buffer is currently held, we cannot undirty
+ * it, since one of the current holders may be in the
+ * middle of an update. Note that users of dbuf_undirty()
+ * should not place a hold on the dbuf before the call.
+ */
+ if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ mutex_exit(&db->db_mtx);
+ /* Make sure we don't toss this buffer at sync phase */
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ DB_DNODE_EXIT(db);
+ return (0);
+ }
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ ASSERT(db->db.db_size != 0);
+
+ /* XXX would be nice to fix up dn_towrite_space[] */
+
+ *drp = dr->dr_next;
+
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_level+1 == dn->dn_nlevels) {
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
+ }
+ DB_DNODE_EXIT(db);
+
+ if (db->db_level == 0) {
+ if (db->db_state != DB_NOFILL) {
+ dbuf_unoverride(dr);
+
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ }
+ } else {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+
+ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+ arc_buf_t *buf = db->db_buf;
+
+ ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ return (1);
+ }
+
+ mutex_exit(&db->db_mtx);
+ return (0);
+}
+
+#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
+void
+dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
+ (void) dbuf_read(db, NULL, rf);
+ (void) dbuf_dirty(db, tx);
+}
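+
+/*
+ * Illustrative sketch of the usual dirty protocol, assuming a caller that
+ * already has a dnode (dn), a block id (blkid) and an assigned transaction
+ * (tx), and that holds dn_struct_rwlock; these names are placeholders, not
+ * part of this file.  Hold the level-0 dbuf, mark it dirty before modifying
+ * its contents, then release the hold:
+ *
+ *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
+ *	if (db != NULL) {
+ *		dbuf_will_dirty(db, tx);
+ *		bzero(db->db.db_data, db->db.db_size);
+ *		dbuf_rele(db, FTAG);
+ *	}
+ */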
+
+void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_state = DB_NOFILL;
+
+ dmu_buf_will_fill(db_fake, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_level == 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+ dmu_tx_private_ok(tx));
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ if (db->db_state == DB_FILL) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ /* we were freed while filling */
+ /* XXX dbuf_undirty? */
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_freed_in_flight = FALSE;
+ }
+ db->db_state = DB_CACHED;
+ cv_broadcast(&db->db_changed);
+ }
+ mutex_exit(&db->db_mtx);
+}
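+
+/*
+ * Sketch of the fill protocol for a hypothetical caller that overwrites an
+ * entire level-0 block, so no read is needed (db, src and tx are assumed to
+ * be in scope): dmu_buf_will_fill() prepares the dbuf to be overwritten
+ * without a read (via dbuf_noread() and dbuf_dirty()), the caller copies
+ * its data in, and dbuf_fill_done() moves the filling dbuf to DB_CACHED
+ * and wakes any waiters:
+ *
+ *	dmu_buf_will_fill(&db->db, tx);
+ *	bcopy(src, db->db.db_data, db->db.db_size);
+ *	dbuf_fill_done(db, tx);
+ */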
+
+/*
+ * Directly assign a provided arc buf to a given dbuf if it's not referenced
+ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
+ */
+void
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+{
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(db->db_level == 0);
+ ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
+ ASSERT(buf != NULL);
+ ASSERT(arc_buf_size(buf) == db->db.db_size);
+ ASSERT(tx->tx_txg != 0);
+
+ arc_return_buf(buf, db);
+ ASSERT(arc_released(buf));
+
+ mutex_enter(&db->db_mtx);
+
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+
+ if (db->db_state == DB_CACHED &&
+ refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ xuio_stat_wbuf_copied();
+ return;
+ }
+
+ xuio_stat_wbuf_nocopy();
+ if (db->db_state == DB_CACHED) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(db->db_buf != NULL);
+ if (dr != NULL && dr->dr_txg == tx->tx_txg) {
+ ASSERT(dr->dt.dl.dr_data == db->db_buf);
+ if (!arc_released(db->db_buf)) {
+ ASSERT(dr->dt.dl.dr_override_state ==
+ DR_OVERRIDDEN);
+ arc_release(db->db_buf, db);
+ }
+ dr->dt.dl.dr_data = buf;
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+ } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
+ arc_release(db->db_buf, db);
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+ }
+ db->db_buf = NULL;
+ }
+ ASSERT(db->db_buf == NULL);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_FILL;
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ dbuf_fill_done(db, tx);
+}
+
+/*
+ * "Clear" the contents of this dbuf. This will mark the dbuf
+ * EVICTING and clear *most* of its references.  Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
+ * in this case. For callers from the DMU we will usually see:
+ * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ * DMU: dbuf_clear()->arc_buf_evict()
+ * ARC: dbuf_do_evict()->dbuf_destroy()
+ */
+void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb;
+ int dbuf_gone = FALSE;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ dbuf_evict_user(db);
+
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ }
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
+
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
+ ASSERT(db->db_data_pending == NULL);
+
+ db->db_state = DB_EVICTING;
+ db->db_blkptr = NULL;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
+ if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
+ dnode_rele(dn, db);
+ db->db_dnode_handle = NULL;
+ } else {
+ DB_DNODE_EXIT(db);
+ }
+
+ if (db->db_buf)
+ dbuf_gone = arc_buf_evict(db->db_buf);
+
+ if (!dbuf_gone)
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * If this dbuf is referenced from an indirect dbuf,
+ * decrement the ref count on the indirect dbuf.
+ */
+ if (parent && parent != dndb)
+ dbuf_rele(parent, db);
+}
+
+static int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+ dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+ int nlevels, epbs;
+
+ *parentp = NULL;
+ *bpp = NULL;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ if (blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_have_spill &&
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ *bpp = &dn->dn_phys->dn_spill;
+ else
+ *bpp = NULL;
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
+
+ if (dn->dn_phys->dn_nlevels == 0)
+ nlevels = 1;
+ else
+ nlevels = dn->dn_phys->dn_nlevels;
+
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT3U(level * epbs, <, 64);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ if (level >= nlevels ||
+ (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+ /* the buffer has no parent yet */
+ return (ENOENT);
+ } else if (level < nlevels-1) {
+ /* this block is referenced from an indirect block */
+ int err = dbuf_hold_impl(dn, level+1,
+ blkid >> epbs, fail_sparse, NULL, parentp);
+ if (err)
+ return (err);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err) {
+ dbuf_rele(*parentp, NULL);
+ *parentp = NULL;
+ return (err);
+ }
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ return (0);
+ } else {
+ /* the block is referenced from the dnode */
+ ASSERT3U(level, ==, nlevels-1);
+ ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+ blkid < dn->dn_phys->dn_nblkptr);
+ if (dn->dn_dbuf) {
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ }
+ *bpp = &dn->dn_phys->dn_blkptr[blkid];
+ return (0);
+ }
+}
+
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+ dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+ objset_t *os = dn->dn_objset;
+ dmu_buf_impl_t *db, *odb;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+ db->db_objset = os;
+ db->db.db_object = dn->dn_object;
+ db->db_level = level;
+ db->db_blkid = blkid;
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt = 0;
+ db->db_dnode_handle = dn->dn_handle;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
+
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+ db->db_immediate_evict = 0;
+ db->db_freed_in_flight = 0;
+
+ if (blkid == DMU_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
+ db->db.db_size = DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ db->db.db_offset = DMU_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ /* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+ return (db);
+ } else if (blkid == DMU_SPILL_BLKID) {
+ db->db.db_size = (blkptr != NULL) ?
+ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+ db->db.db_offset = 0;
+ } else {
+ int blocksize =
+ db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+ db->db.db_size = blocksize;
+ db->db.db_offset = db->db_blkid * blocksize;
+ }
+
+ /*
+	 * Hold the dn_dbufs_mtx while we insert the new dbuf
+	 * into the hash table *and* add it to the dbufs list.
+	 * This prevents a possible deadlock with someone
+	 * trying to look up this dbuf before it's added to the
+ * dn_dbufs list.
+ */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING;
+ if ((odb = dbuf_hash_insert(db)) != NULL) {
+ /* someone else inserted it first */
+ kmem_cache_free(dbuf_cache, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ return (odb);
+ }
+ list_insert_head(&dn->dn_dbufs, db);
+ db->db_state = DB_UNCACHED;
+ mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_add_ref(parent, db);
+
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ refcount_count(&dn->dn_holds) > 0);
+ (void) refcount_add(&dn->dn_holds, db);
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+
+ dprintf_dbuf(db, "db=%p\n", db);
+
+ return (db);
+}
+
+static int
+dbuf_do_evict(void *private)
+{
+ arc_buf_t *buf = private;
+ dmu_buf_impl_t *db = buf->b_private;
+
+ if (!MUTEX_HELD(&db->db_mtx))
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ if (db->db_state != DB_EVICTING) {
+ ASSERT(db->db_state == DB_CACHED);
+ DBUF_VERIFY(db);
+ db->db_buf = NULL;
+ dbuf_evict(db);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dbuf_destroy(db);
+ }
+ return (0);
+}
+
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * If this dbuf is still on the dn_dbufs list,
+ * remove it from that list.
+ */
+ if (db->db_dnode_handle != NULL) {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold
+ * corresponding to the removed dbuf is no longer
+ * discounted in dnode_move(), so the dnode cannot be
+ * moved until after we release the hold.
+ */
+ dnode_rele(dn, db);
+ db->db_dnode_handle = NULL;
+ }
+ dbuf_hash_remove(db);
+ }
+ db->db_parent = NULL;
+ db->db_buf = NULL;
+
+ ASSERT(!list_link_active(&db->db_link));
+ ASSERT(db->db.db_data == NULL);
+ ASSERT(db->db_hash_next == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ ASSERT(db->db_data_pending == NULL);
+
+ kmem_cache_free(dbuf_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+}
+
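+/*
+ * Start an asynchronous, speculative read of level-0 block "blkid" into the
+ * ARC, unless the block has been freed or a dbuf for it already exists.
+ * No dbuf is created for the prefetched block itself; only the parent may
+ * be held temporarily to locate the block pointer.
+ */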
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+{
+ dmu_buf_impl_t *db = NULL;
+ blkptr_t *bp = NULL;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ if (dnode_block_freed(dn, blkid))
+ return;
+
+ /* dbuf_find() returns with db_mtx held */
+ if (db = dbuf_find(dn, 0, blkid)) {
+ /*
+ * This dbuf is already in the cache. We assume that
+ * it is already CACHED, or else about to be either
+ * read or filled.
+ */
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
+ if (bp && !BP_IS_HOLE(bp)) {
+ int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
+ ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
+ arc_buf_t *pbuf;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+ zbookmark_t zb;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, 0, blkid);
+
+ if (db)
+ pbuf = db->db_buf;
+ else
+ pbuf = dn->dn_objset->os_phys_buf;
+
+ (void) dsl_read(NULL, dn->dn_objset->os_spa,
+ bp, pbuf, NULL, NULL, priority,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zb);
+ }
+ if (db)
+ dbuf_rele(db, NULL);
+ }
+}
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+ void *tag, dmu_buf_impl_t **dbp)
+{
+ dmu_buf_impl_t *db, *parent = NULL;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(dn->dn_nlevels, >, level);
+
+ *dbp = NULL;
+top:
+ /* dbuf_find() returns with db_mtx held */
+ db = dbuf_find(dn, level, blkid);
+
+ if (db == NULL) {
+ blkptr_t *bp = NULL;
+ int err;
+
+ ASSERT3P(parent, ==, NULL);
+ err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+ if (fail_sparse) {
+ if (err == 0 && bp && BP_IS_HOLE(bp))
+ err = ENOENT;
+ if (err) {
+ if (parent)
+ dbuf_rele(parent, NULL);
+ return (err);
+ }
+ }
+ if (err && err != ENOENT)
+ return (err);
+ db = dbuf_create(dn, level, blkid, parent, bp);
+ }
+
+ if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+ arc_buf_add_ref(db->db_buf, db);
+ if (db->db_buf->b_data == NULL) {
+ dbuf_clear(db);
+ if (parent) {
+ dbuf_rele(parent, NULL);
+ parent = NULL;
+ }
+ goto top;
+ }
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
+ /*
+	 * If this buffer is currently syncing out, and we are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
+ */
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_state == DB_CACHED && db->db_data_pending) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ dbuf_set_data(db,
+ arc_buf_alloc(dn->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
+ db->db.db_size);
+ }
+ }
+
+ (void) refcount_add(&db->db_holds, tag);
+ dbuf_update_data(db);
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ /* NOTE: we can't rele the parent until after we drop the db_mtx */
+ if (parent)
+ dbuf_rele(parent, NULL);
+
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ ASSERT3U(db->db_blkid, ==, blkid);
+ ASSERT3U(db->db_level, ==, level);
+ *dbp = db;
+
+ return (0);
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+void
+dbuf_create_bonus(dnode_t *dn)
+{
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ return (ENOTSUP);
+ if (blksz == 0)
+ blksz = SPA_MINBLOCKSIZE;
+ if (blksz > SPA_MAXBLOCKSIZE)
+ blksz = SPA_MAXBLOCKSIZE;
+ else
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dbuf_new_size(db, blksz, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
+
+ return (0);
+}
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds = refcount_add(&db->db_holds, tag);
+ ASSERT(holds > 1);
+}
+
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
+#pragma weak dmu_buf_rele = dbuf_rele
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf. This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically.
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ DBUF_VERIFY(db);
+
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
+ holds = refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+ arc_buf_freeze(db->db_buf);
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_immediate_evict)
+ dbuf_evict_user(db);
+
+ if (holds == 0) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * If the dnode moves here, we cannot cross this barrier
+ * until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+ (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+ DB_DNODE_EXIT(db);
+ /*
+ * The bonus buffer's dnode hold is no longer discounted
+ * in dnode_move(). The dnode cannot move until after
+ * the dnode_rele().
+ */
+ dnode_rele(DB_DNODE(db), db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
+ dbuf_evict(db);
+ } else if (arc_released(db->db_buf)) {
+ arc_buf_t *buf = db->db_buf;
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ } else {
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+ if (!DBUF_IS_CACHEABLE(db))
+ dbuf_clear(db);
+ else
+ mutex_exit(&db->db_mtx);
+ }
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+ return (refcount_count(&db->db_holds));
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *evict_func)
+{
+ return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+ user_data_ptr_ptr, evict_func));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *evict_func)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_immediate_evict = TRUE;
+ return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+ user_data_ptr_ptr, evict_func));
+}
+
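+/*
+ * Compare-and-set for the dbuf's user state: the user pointer, data
+ * pointer and eviction callback are replaced only if the current user
+ * pointer still equals old_user_ptr.  Returns old_user_ptr on success,
+ * or the conflicting current user pointer if someone else got there first.
+ */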
+void *
+dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT(db->db_level == 0);
+
+ ASSERT((user_ptr == NULL) == (evict_func == NULL));
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_user_ptr == old_user_ptr) {
+ db->db_user_ptr = user_ptr;
+ db->db_user_data_ptr_ptr = user_data_ptr_ptr;
+ db->db_evict_func = evict_func;
+
+ dbuf_update_data(db);
+ } else {
+ old_user_ptr = db->db_user_ptr;
+ }
+
+ mutex_exit(&db->db_mtx);
+ return (old_user_ptr);
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ return (db->db_user_ptr);
+}
+
+boolean_t
+dmu_buf_freeable(dmu_buf_t *dbuf)
+{
+ boolean_t res = B_FALSE;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+
+ if (db->db_blkptr)
+ res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
+ db->db_blkptr, db->db_blkptr->blk_birth);
+
+ return (res);
+}
+
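+/*
+ * Make sure db->db_blkptr points at the on-disk location for this dbuf:
+ * the dnode's spill pointer, one of the dnode's embedded block pointers,
+ * or the appropriate slot in the parent indirect block (holding the parent
+ * first if we don't have it yet).
+ */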
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+	/* ASSERT(dmu_tx_is_syncing(tx)) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ db->db_blkptr = &dn->dn_phys->dn_spill;
+ BP_ZERO(db->db_blkptr);
+ return;
+ }
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+		 * This buffer was allocated at a time when there were
+		 * no available blkptrs from the dnode, or it was
+		 * inappropriate to hook it in (i.e., nlevels mismatch).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ (void) dbuf_hold_impl(dn, db->db_level+1,
+ db->db_blkid >> epbs, FALSE, db, &parent);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
+}
+
+static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ zio_t *zio;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT(db->db_buf != NULL);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ dbuf_check_blkptr(dn, db);
+ DB_DNODE_EXIT(db);
+
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+ dbuf_write(dr, db->db_buf, tx);
+
+ zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
+static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ objset_t *os;
+ uint64_t txg = tx->tx_txg;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * To be synced, we must be dirtied. But we
+ * might have been freed after the dirty.
+ */
+ if (db->db_state == DB_UNCACHED) {
+ /* This buffer has been freed since it was dirtied */
+ ASSERT(db->db.db_data == NULL);
+ } else if (db->db_state == DB_FILL) {
+ /* This buffer was freed and is now being re-filled */
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else {
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
+ }
+ DBUF_VERIFY(db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ /*
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
+ */
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_dirty_record_t **drp;
+
+ ASSERT(*datap != NULL);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ DB_DNODE_EXIT(db);
+
+ if (*datap != db->db.db_data) {
+ zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ }
+ db->db_data_pending = NULL;
+ drp = &db->db_last_dirty;
+ while (*drp != dr)
+ drp = &(*drp)->dr_next;
+ ASSERT(dr->dr_next == NULL);
+ ASSERT(dr->dr_dbuf == db);
+ *drp = dr->dr_next;
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+ return;
+ }
+
+ os = dn->dn_objset;
+
+ /*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this buffer is in the middle of an immediate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ if (db->db_state != DB_NOFILL &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ refcount_count(&db->db_holds) > 1 &&
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
+ *datap == db->db_buf) {
+ /*
+ * If this buffer is currently "in use" (i.e., there
+ * are active holds and db_data still references it),
+ * then make a copy before we start the write so that
+ * any modifications from the open txg will not leak
+ * into this write.
+ *
+ * NOTE: this copy does not need to be made for
+ * objects only modified in the syncing context (e.g.
+		 * DNODE blocks).
+ */
+ int blksz = arc_buf_size(*datap);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+ bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ }
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write(dr, *datap, tx);
+
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
+ list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+ DB_DNODE_EXIT(db);
+ } else {
+ /*
+ * Although zio_nowait() does not "wait for an IO", it does
+ * initiate the IO. If this is an empty write it seems plausible
+ * that the IO could actually be completed before the nowait
+ * returns. We need to DB_DNODE_EXIT() first in case
+ * zio_nowait() invalidates the dbuf.
+ */
+ DB_DNODE_EXIT(db);
+ zio_nowait(dr->dr_zio);
+ }
+}
+
+void
+dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+
+ while (dr = list_head(list)) {
+ if (dr->dr_zio != NULL) {
+ /*
+ * If we find an already initialized zio then we
+ * are processing the meta-dnode, and we have finished.
+ * The dbufs for all dnodes are put back on the list
+ * during processing, so that we can zio_wait()
+ * these IOs after initiating all child IOs.
+ */
+ ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+ DMU_META_DNODE_OBJECT);
+ break;
+ }
+ list_remove(list, dr);
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ spa_t *spa = zio->io_spa;
+ int64_t delta;
+ uint64_t fill = 0;
+ int i;
+
+ ASSERT(db->db_blkptr == bp);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+ zio->io_prev_space_delta = delta;
+
+ if (BP_IS_HOLE(bp)) {
+ ASSERT(bp->blk_fill == 0);
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype));
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+
+ mutex_enter(&db->db_mtx);
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ }
+#endif
+
+ if (db->db_level == 0) {
+ mutex_enter(&dn->dn_mtx);
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+ db->db_blkid != DMU_SPILL_BLKID)
+ dn->dn_phys->dn_maxblkid = db->db_blkid;
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_type == DMU_OT_DNODE) {
+ dnode_phys_t *dnp = db->db.db_data;
+ for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+ i--, dnp++) {
+ if (dnp->dn_type != DMU_OT_NONE)
+ fill++;
+ }
+ } else {
+ fill = 1;
+ }
+ } else {
+ blkptr_t *ibp = db->db.db_data;
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+ if (BP_IS_HOLE(ibp))
+ continue;
+ fill += ibp->blk_fill;
+ }
+ }
+ DB_DNODE_EXIT(db);
+
+ bp->blk_fill = fill;
+
+ mutex_exit(&db->db_mtx);
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint64_t txg = zio->io_txg;
+ dbuf_dirty_record_t **drp, *dr;
+
+ ASSERT3U(zio->io_error, ==, 0);
+ ASSERT(db->db_blkptr == bp);
+
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ objset_t *os;
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+
+ DB_GET_OBJSET(&os, db);
+ ds = os->os_dsl_dataset;
+ tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ DBUF_VERIFY(db);
+
+ drp = &db->db_last_dirty;
+ while ((dr = *drp) != db->db_data_pending)
+ drp = &dr->dr_next;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ DB_DNODE_EXIT(db);
+ }
+#endif
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ if (db->db_state != DB_NOFILL) {
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ else if (!arc_released(db->db_buf))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs =
+ dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ ASSERT3U(dn->dn_phys->dn_maxblkid
+ >> (db->db_level * epbs), >=, db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+ DB_DNODE_EXIT(db);
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ cv_broadcast(&db->db_changed);
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+ mutex_enter(&db->db_mtx);
+ if (!BP_EQUAL(zio->io_bp, obp)) {
+ if (!BP_IS_HOLE(obp))
+ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+ arc_release(dr->dt.dl.dr_data, db);
+ }
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write_done(zio, NULL, db);
+}
+
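+/*
+ * Set up the zio that will write out this dirty record: chain it under the
+ * parent indirect's pending write (or the dnode's zio), compute the write
+ * policy, and create either an override write (when the record is already
+ * DR_OVERRIDDEN), a NODATA write for NOFILL dbufs, or a normal ARC write.
+ */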
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn;
+ objset_t *os;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+ int wp_flag = 0;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ os = dn->dn_objset;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr)) {
+ arc_buf_thaw(data);
+ } else {
+ dbuf_release_bp(db);
+ }
+ }
+ }
+
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
+ ASSERT(db->db_level == parent->db_level-1);
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+ db->db_blkid != DMU_SPILL_BLKID) ||
+ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ wp_flag = WP_SPILL;
+ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ DB_DNODE_EXIT(db);
+
+ if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ ASSERT(db->db_state != DB_NOFILL);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+ dbuf_write_override_ready, dbuf_write_override_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ mutex_enter(&db->db_mtx);
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+ dr->dt.dl.dr_copies);
+ mutex_exit(&db->db_mtx);
+ } else if (db->db_state == DB_NOFILL) {
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, NULL, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+ } else {
+ ASSERT(arc_released(data));
+ dr->dr_zio = arc_write(zio, os->os_spa, txg,
+ db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
+ dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
+}
diff --git a/uts/common/fs/zfs/ddt.c b/uts/common/fs/zfs/ddt.c
new file mode 100644
index 000000000000..718331496765
--- /dev/null
+++ b/uts/common/fs/zfs/ddt.c
@@ -0,0 +1,1146 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
+
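+/*
+ * On-disk DDT storage backends, indexed by enum ddt_type; the ZAP
+ * implementation is currently the only one.
+ */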
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ ASSERT(ddt_object_count(ddt, type, class) == 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+ *objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+ if (error)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]);
+
+ /*
+ * Seed the cached statistics.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+
+ /*
+ * Cache DDT statistics; this is the only time they'll change.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+}
+
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde);
+}
+
+int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *walk, ddt_entry_t *dde)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+}
+
+uint64_t
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class]));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+{
+ (void) sprintf(name, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(txg != 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+void
+ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+{
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+ bp->blk_fill = 1;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+ ASSERT(ddp->ddp_phys_birth == 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+ bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+ ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+ ASSERT((int64_t)ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+ blkptr_t blk;
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+ uint64_t refcnt = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ bzero(dds, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+}
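+
+/*
+ * Illustrative note (not from the original source): ddt_stat_add() treats
+ * both structures as flat arrays of uint64_t and relies on the
+ * two's-complement identity (x ^ -1ULL) - (-1ULL) == -x, so neg == 0 adds
+ * src into dst while neg == -1ULL subtracts it.  A minimal sketch of the
+ * idiom:
+ *
+ * uint64_t x = 7, acc = 10, neg;
+ * neg = 0;     acc += (x ^ neg) - neg;   acc is now 17 (addition)
+ * neg = -1ULL; acc += (x ^ neg) - neg;   acc is now 10 again (subtraction)
+ */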
+
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
+ bucket = highbit(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ bzero(dds, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
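+
+/*
+ * Worked example (not from the original source): if the tables record
+ * 300GB of referenced data (dds_ref_dsize) that deduplicates down to
+ * 100GB of allocated space (dds_dsize), the expression above yields
+ * 300 * 100 / 100 = 300, i.e. a 3.00x dedup ratio.  A pool with no
+ * duplicate blocks reports the baseline value of 100 (1.00x).
+ */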
+
+int
+ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+{
+ spa_t *spa = ddt->ddt_spa;
+ uint64_t total_refcnt = 0;
+ uint64_t ditto = spa->spa_dedup_ditto;
+ int total_copies = 0;
+ int desired_copies = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *zio = dde->dde_lead_zio[p];
+ uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
+ if (zio != NULL)
+ refcnt += zio->io_parent_count; /* pending refs */
+ if (ddp == ddp_willref)
+ refcnt++; /* caller's ref */
+ if (refcnt != 0) {
+ total_refcnt += refcnt;
+ total_copies += p;
+ }
+ }
+
+ if (ditto == 0 || ditto > UINT32_MAX)
+ ditto = UINT32_MAX;
+
+ if (total_refcnt >= 1)
+ desired_copies++;
+ if (total_refcnt >= ditto)
+ desired_copies++;
+ if (total_refcnt >= ditto * ditto)
+ desired_copies++;
+
+ return (MAX(desired_copies, total_copies) - total_copies);
+}
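+
+/*
+ * Worked example (not from the original source): with spa_dedup_ditto set
+ * to 100, an entry referenced at least once wants 1 copy, one referenced
+ * 100 or more times wants 2 copies, and one referenced 10000 (100 * 100)
+ * or more times wants 3 copies.  The value returned is only the number of
+ * additional copies to write beyond those already present or in flight.
+ */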
+
+int
+ddt_ditto_copies_present(ddt_entry_t *dde)
+{
+ ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+ dva_t *dva = ddp->ddp_dva;
+ int copies = 0 - DVA_GET_GANG(dva);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ if (DVA_IS_VALID(dva))
+ copies++;
+
+ ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+ return (copies);
+}
+
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+
+ return (c_len + 1);
+}
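+
+/*
+ * Layout note (not from the original source): the byte written through
+ * *version above packs the host byte order flag (under
+ * DDT_COMPRESS_BYTEORDER_MASK) together with the compression function
+ * used, so ddt_decompress() below can select the matching decompressor
+ * and decide whether the decoded ddt_phys_t array needs byteswapping.
+ */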
+
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+ byteswap_uint64_array(dst, d_len);
+}
+
+ddt_t *
+ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+{
+ return (spa->spa_ddt[c]);
+}
+
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+void
+ddt_enter(ddt_t *ddt)
+{
+ mutex_enter(&ddt->ddt_lock);
+}
+
+void
+ddt_exit(ddt_t *ddt)
+{
+ mutex_exit(&ddt->ddt_lock);
+}
+
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+ ddt_entry_t *dde;
+
+ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+}
+
+static void
+ddt_free(ddt_entry_t *dde)
+{
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_data != NULL)
+ zio_buf_free(dde->dde_repair_data,
+ DDK_GET_PSIZE(&dde->dde_key));
+
+ cv_destroy(&dde->dde_cv);
+ kmem_free(dde, sizeof (*dde));
+}
+
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+}
+
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT)
+ break;
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ASSERT(error == 0 || error == ENOENT);
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+}
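+
+/*
+ * Descriptive note (not from the original source): ddt_lookup() uses a
+ * single-loader handshake -- the first thread to see an unloaded entry
+ * sets dde_loading, drops ddt_lock while it probes each on-disk object
+ * for the key, and finally broadcasts dde_cv.  Concurrent lookups for the
+ * same key wait on dde_cv, so the on-disk lookup runs at most once per
+ * entry.
+ */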
+
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
+ return;
+
+ /*
+ * We only remove the DDT once all tables are empty and only
+ * prefetch dedup blocks when there are entries in the DDT.
+ * Thus no locking is required as the DDT can't disappear on us.
+ */
+ ddt = ddt_select(spa, bp);
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &dde);
+ }
+ }
+}
+
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+ const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
+
+ for (int i = 0; i < DDT_KEY_WORDS; i++) {
+ if (u1[i] < u2[i])
+ return (-1);
+ if (u1[i] > u2[i])
+ return (1);
+ }
+
+ return (0);
+}
+
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+ ddt_t *ddt;
+
+ ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+}
+
+static void
+ddt_table_free(ddt_t *ddt)
+{
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_free(ddt, sizeof (*ddt));
+}
+
+void
+ddt_create(spa_t *spa)
+{
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+int
+ddt_load(spa_t *spa)
+{
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+
+ /*
+ * Seed the cached histograms.
+ */
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ }
+
+ return (0);
+}
+
+void
+ddt_unload(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+}
+
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ for (enum ddt_class class = 0; class <= max_class; class++)
+ if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+}
+
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+}
+
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+}
+
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+}
+
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+}
+
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+ nclass = DDT_CLASS_DITTO;
+ else if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
+ }
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object, tx) == 0);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ uint64_t count = 0;
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (ddt_object_exists(ddt, type, class)) {
+ ddt_object_sync(ddt, type, class, tx);
+ count += ddt_object_count(ddt, type, class);
+ }
+ }
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (count == 0 && ddt_object_exists(ddt, type, class))
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+}
+
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ zio_t *rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+
+ dmu_tx_commit(tx);
+}
+
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (ENOENT);
+}
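+
+/*
+ * Usage sketch (not from the original source): a caller iterates the
+ * entire on-disk DDT by starting from a zeroed bookmark and calling
+ * ddt_walk() until it returns ENOENT, e.g.:
+ *
+ * ddt_bookmark_t ddb = { 0 };
+ * ddt_entry_t dde;
+ * while (ddt_walk(spa, &ddb, &dde) == 0)
+ *         process_entry(&dde);   (hypothetical per-entry handler)
+ *
+ * Because the bookmark captures checksum, type, class and cursor, the
+ * walk can be suspended and resumed later, which is how the scan code
+ * checkpoints its progress through the DDT.
+ */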
diff --git a/uts/common/fs/zfs/ddt_zap.c b/uts/common/fs/zfs/ddt_zap.c
new file mode 100644
index 000000000000..d6a991c7c19e
--- /dev/null
+++ b/uts/common/fs/zfs/ddt_zap.c
@@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <util/sscanf.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+ if (prehash)
+ flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ DMU_OT_NONE, 0, tx);
+
+ return (*objectp == 0 ? ENOTSUP : 0);
+}
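+
+/*
+ * Descriptive note (not from the original source): DDT entries are keyed
+ * by the multi-word ddt_key_t (the block checksum plus encoded size and
+ * compression properties), so the ZAP is created with 64-bit-array keys
+ * (ZAP_FLAG_UINT64_KEY | ZAP_FLAG_HASH64).  When the dedup checksum is
+ * itself a strong hash, the caller passes prehash == B_TRUE and
+ * ZAP_FLAG_PRE_HASHED_KEY lets the ZAP derive the entry hash directly
+ * from the key instead of hashing it again.
+ */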
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t one, csize;
+ int error;
+
+ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, &one, &csize);
+ if (error)
+ return (error);
+
+ ASSERT(one == 1);
+ ASSERT(csize <= sizeof (cbuf));
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ if (error)
+ return (error);
+
+ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+ return (0);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize;
+
+ csize = ddt_compress(dde->dde_phys, cbuf,
+ sizeof (dde->dde_phys), sizeof (cbuf));
+
+ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize = za.za_num_integers;
+ ASSERT(za.za_integer_length == 1);
+ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ ASSERT(error == 0);
+ if (error == 0) {
+ ddt_decompress(cbuf, dde->dde_phys, csize,
+ sizeof (dde->dde_phys));
+ dde->dde_key = *(ddt_key_t *)za.za_name;
+ }
+ zap_cursor_advance(&zc);
+ *walk = zap_cursor_serialize(&zc);
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
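+
+/*
+ * Descriptive note (not from the original source): the *walk cookie is a
+ * serialized ZAP cursor (zap_cursor_serialize()), so a traversal can be
+ * stopped after any entry and resumed later from the saved value; this is
+ * the value ddt_walk() keeps in ddb_cursor.
+ */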
+
+static uint64_t
+ddt_zap_count(objset_t *os, uint64_t object)
+{
+ uint64_t count = 0;
+
+ VERIFY(zap_count(os, object, &count) == 0);
+
+ return (count);
+}
+
+const ddt_ops_t ddt_zap_ops = {
+ "zap",
+ ddt_zap_create,
+ ddt_zap_destroy,
+ ddt_zap_lookup,
+ ddt_zap_prefetch,
+ ddt_zap_update,
+ ddt_zap_remove,
+ ddt_zap_walk,
+ ddt_zap_count,
+};
diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c
new file mode 100644
index 000000000000..39234eba53b2
--- /dev/null
+++ b/uts/common/fs/zfs/dmu.c
@@ -0,0 +1,1764 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/sa.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <sys/zfs_znode.h>
+#endif
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+ { byteswap_uint8_array, TRUE, "unallocated" },
+ { zap_byteswap, TRUE, "object directory" },
+ { byteswap_uint64_array, TRUE, "object array" },
+ { byteswap_uint8_array, TRUE, "packed nvlist" },
+ { byteswap_uint64_array, TRUE, "packed nvlist size" },
+ { byteswap_uint64_array, TRUE, "bpobj" },
+ { byteswap_uint64_array, TRUE, "bpobj header" },
+ { byteswap_uint64_array, TRUE, "SPA space map header" },
+ { byteswap_uint64_array, TRUE, "SPA space map" },
+ { byteswap_uint64_array, TRUE, "ZIL intent log" },
+ { dnode_buf_byteswap, TRUE, "DMU dnode" },
+ { dmu_objset_byteswap, TRUE, "DMU objset" },
+ { byteswap_uint64_array, TRUE, "DSL directory" },
+ { zap_byteswap, TRUE, "DSL directory child map"},
+ { zap_byteswap, TRUE, "DSL dataset snap map" },
+ { zap_byteswap, TRUE, "DSL props" },
+ { byteswap_uint64_array, TRUE, "DSL dataset" },
+ { zfs_znode_byteswap, TRUE, "ZFS znode" },
+ { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" },
+ { byteswap_uint8_array, FALSE, "ZFS plain file" },
+ { zap_byteswap, TRUE, "ZFS directory" },
+ { zap_byteswap, TRUE, "ZFS master node" },
+ { zap_byteswap, TRUE, "ZFS delete queue" },
+ { byteswap_uint8_array, FALSE, "zvol object" },
+ { zap_byteswap, TRUE, "zvol prop" },
+ { byteswap_uint8_array, FALSE, "other uint8[]" },
+ { byteswap_uint64_array, FALSE, "other uint64[]" },
+ { zap_byteswap, TRUE, "other ZAP" },
+ { zap_byteswap, TRUE, "persistent error log" },
+ { byteswap_uint8_array, TRUE, "SPA history" },
+ { byteswap_uint64_array, TRUE, "SPA history offsets" },
+ { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "DSL permissions" },
+ { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { byteswap_uint8_array, TRUE, "ZFS SYSACL" },
+ { byteswap_uint8_array, TRUE, "FUID table" },
+ { byteswap_uint64_array, TRUE, "FUID table size" },
+ { zap_byteswap, TRUE, "DSL dataset next clones"},
+ { zap_byteswap, TRUE, "scan work queue" },
+ { zap_byteswap, TRUE, "ZFS user/group used" },
+ { zap_byteswap, TRUE, "ZFS user/group quota" },
+ { zap_byteswap, TRUE, "snapshot refcount tags"},
+ { zap_byteswap, TRUE, "DDT ZAP algorithm" },
+ { zap_byteswap, TRUE, "DDT statistics" },
+ { byteswap_uint8_array, TRUE, "System attributes" },
+ { zap_byteswap, TRUE, "SA master node" },
+ { zap_byteswap, TRUE, "SA attr registration" },
+ { zap_byteswap, TRUE, "SA attr layouts" },
+ { zap_byteswap, TRUE, "scan translations" },
+ { byteswap_uint8_array, FALSE, "deduplicated block" },
+ { zap_byteswap, TRUE, "DSL deadlist map" },
+ { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" },
+ { zap_byteswap, TRUE, "DSL dir clones" },
+ { byteswap_uint64_array, TRUE, "bpobj subobj" },
+};
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ blkid = dbuf_whichblock(dn, offset);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL) {
+ err = EIO;
+ } else {
+ err = dbuf_read(db, NULL, db_flags);
+ if (err) {
+ dbuf_rele(db, tag);
+ db = NULL;
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ *dbp = &db->db; /* NULL db plus first field offset is NULL */
+ return (err);
+}
+
+int
+dmu_bonus_max(void)
+{
+ return (DN_MAX_BONUSLEN);
+}
+
+int
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = EINVAL;
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (type > DMU_OT_NUMTYPES) {
+ error = EINVAL;
+ } else if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error)
+ return (error);
+ dbuf_rm_spill(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_rm_spill(dn, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
+/*
+ * returns ENOENT, EIO, or 0.
+ */
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error)
+ return (error);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dbuf_create_bonus(dn);
+ }
+ db = dn->dn_bonus;
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (refcount_add(&db->db_holds, tag) == 1) {
+ VERIFY(dnode_add_ref(dn, db));
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
+
+ *dbp = &db->db;
+ return (0);
+}
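+
+/*
+ * Descriptive note (not from the original source): the bonus dbuf is
+ * created lazily with a double-checked pattern -- dn_bonus is tested under
+ * the reader lock, then tested again after upgrading dn_struct_rwlock to
+ * writer, because another thread may have created it while the lock was
+ * briefly dropped between the two acquisitions.
+ */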
+
+/*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * If you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = NULL;
+ int err;
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+
+ ASSERT(db != NULL);
+ err = dbuf_read(db, NULL, flags);
+ if (err == 0)
+ *dbp = &db->db;
+ else
+ dbuf_rele(db, tag);
+ return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = EINVAL;
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = ENOENT;
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+static int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+{
+ dsl_pool_t *dp = NULL;
+ dmu_buf_t **dbp;
+ uint64_t blkid, nblks, i;
+ uint32_t dbuf_flags;
+ int err;
+ zio_t *zio;
+ hrtime_t start;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
+
+ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
+ if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
+ dbuf_flags |= DB_RF_NOPREFETCH;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
+ nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
+ P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+ } else {
+ if (offset + length > dn->dn_datablksz) {
+ zfs_panic_recover("zfs: accessing past end of object "
+ "%llx/%llx (size=%u access=%llu+%llu)",
+ (longlong_t)dn->dn_objset->
+ os_dsl_dataset->ds_object,
+ (longlong_t)dn->dn_object, dn->dn_datablksz,
+ (longlong_t)offset, (longlong_t)length);
+ rw_exit(&dn->dn_struct_rwlock);
+ return (EIO);
+ }
+ nblks = 1;
+ }
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+ if (dn->dn_objset->os_dsl_dataset)
+ dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
+ if (dp && dsl_pool_sync_context(dp))
+ start = gethrtime();
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ blkid = dbuf_whichblock(dn, offset);
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ zio_nowait(zio);
+ return (EIO);
+ }
+ /* initiate async i/o */
+ if (read) {
+ (void) dbuf_read(db, zio, dbuf_flags);
+ }
+ dbp[i] = &db->db;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* wait for async i/o */
+ err = zio_wait(zio);
+ /* track read overhead when we are in sync context */
+ if (dp && dsl_pool_sync_context(dp))
+ dp->dp_read_overhead += gethrtime() - start;
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+
+ /* wait for other io to complete */
+ if (read) {
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
+
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
+}
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+ int i;
+ dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+
+ if (numbufs == 0)
+ return;
+
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
+
+ kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ int nblks, i, err;
+
+ if (zfs_prefetch_disable)
+ return;
+
+ if (len == 0) { /* they're interested in the bonus buffer */
+ dn = DMU_META_DNODE(os);
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, blkid);
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * XXX - Note, if the dnode for the requested object is not
+ * already cached, we will do a *synchronous* read in the
+ * dnode_hold() call. The same is true for any indirects.
+ */
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
+ nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
+ P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+ } else {
+ nblks = (offset < dn->dn_datablksz);
+ }
+
+ if (nblks != 0) {
+ blkid = dbuf_whichblock(dn, offset);
+ for (i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, blkid+i);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Get the next "chunk" of file data to free. We traverse the file from
+ * the end so that the file gets shorter over time (if we crash in the
+ * middle, this will leave us in a better state). We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ */
+static int
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
+{
+ uint64_t len = *start - limit;
+ uint64_t blkcnt = 0;
+ uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
+ uint64_t iblkrange =
+ dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+ ASSERT(limit <= *start);
+
+ if (len <= iblkrange * maxblks) {
+ *start = limit;
+ return (0);
+ }
+ ASSERT(ISP2(iblkrange));
+
+ while (*start > limit && blkcnt < maxblks) {
+ int err;
+
+ /* find next allocated L1 indirect */
+ err = dnode_next_offset(dn,
+ DNODE_FIND_BACKWARDS, start, 2, 1, 0);
+
+ /* if there are no more, then we are done */
+ if (err == ESRCH) {
+ *start = limit;
+ return (0);
+ } else if (err) {
+ return (err);
+ }
+ blkcnt += 1;
+
+ /* reset offset to end of "next" block back */
+ *start = P2ALIGN(*start, iblkrange);
+ if (*start <= limit)
+ *start = limit;
+ else
+ *start -= 1;
+ }
+ return (0);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length, boolean_t free_dnode)
+{
+ dmu_tx_t *tx;
+ uint64_t object_size, start, end, len;
+ boolean_t trunc = (length == DMU_OBJECT_END);
+ int align, err;
+
+ align = 1 << dn->dn_datablkshift;
+ ASSERT(align > 0);
+ object_size = align == 1 ? dn->dn_datablksz :
+ (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
+
+ end = offset + length;
+ if (trunc || end > object_size)
+ end = object_size;
+ if (end <= offset)
+ return (0);
+ length = end - offset;
+
+ while (length) {
+ start = end;
+ /* assert(offset <= start) */
+ err = get_next_chunk(dn, &start, offset);
+ if (err)
+ return (err);
+ len = trunc ? DMU_OBJECT_END : end - start;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, start, len);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dnode_free_range(dn, start, trunc ? -1 : len, tx);
+
+ if (start == 0 && free_dnode) {
+ ASSERT(trunc);
+ dnode_free(dn, tx);
+ }
+
+ length -= end - start;
+
+ dmu_tx_commit(tx);
+ end = start;
+ }
+ return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_object(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_t *tx;
+ int err;
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err != 0)
+ return (err);
+ if (dn->dn_nlevels == 1) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ } else {
+ err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+ }
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ ASSERT(offset < UINT64_MAX);
+ ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+ dnode_free_range(dn, offset, size, tx);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags)
+{
+ dnode_t *dn;
+ dmu_buf_t **dbp;
+ int numbufs, err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ /*
+ * Deal with odd block sizes, where there can't be data past the first
+ * block. If we ever do the tail block optimization, we will need to
+ * handle that here as well.
+ */
+ if (dn->dn_maxblkid == 0) {
+ int newsz = offset > dn->