field | value | date |
---|---|---|
author | Martin Matuska <mm@FreeBSD.org> | 2012-07-18 08:12:04 +0000 |
committer | Martin Matuska <mm@FreeBSD.org> | 2012-07-18 08:12:04 +0000 |
commit | af56e8c4b416d774961b41eee1eb349d657ebb8c (patch) | |
tree | e332d1e6089905f45302dedddb9967a87ade136a /uts | |
parent | 93a00b0821525e25814cd720fafd04d600811c28 (diff) | |
download | src-af56e8c4b416d774961b41eee1eb349d657ebb8c.tar.gz, src-af56e8c4b416d774961b41eee1eb349d657ebb8c.zip | |
Update vendor-sys/opensolaris to last OpenSolaris state (13149:b23a4dab3d50)
Add ZFS bits to vendor-sys/opensolaris
Obtained from: https://hg.openindiana.org/upstream/oracle/onnv-gate
Notes:
svn path=/vendor-sys/opensolaris/dist/; revision=238567
svn path=/vendor-sys/opensolaris/20100818/; revision=238568; tag=vendor/opensolaris/20100818
Diffstat (limited to 'uts')
213 files changed, 165021 insertions, 110 deletions
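Most of the diff below is the imported uts/common/Makefile.files, which enumerates the object lists for every kernel module (including the newly added ZFS and DTrace objects) and folds architecture-specific objects in through $(MACH) variable indirection, e.g. `CORE_OBJS += $(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS)`. The following is a minimal sketch of that pattern for orientation only; the module name "foo" and its object files are hypothetical and not part of the actual import.

```make
# Sketch of the Makefile.files convention (hypothetical module "foo").
# Each module gets an _OBJS list; architecture-specific objects are
# selected by expanding $(MACH) (e.g. "i386" or "sparc") into the
# variable name before the outer expansion.

MACH ?= i386                       # normally provided by the uts build environment

i386_FOO_OBJS   += foo_intel.o     # built only on x86
sparc_FOO_OBJS  += foo_sparc.o     # built only on SPARC
COMMON_FOO_OBJS += foo_main.o foo_subr.o

# $($(MACH)_FOO_OBJS) first expands MACH, then reads the per-arch list.
FOO_OBJS += $(COMMON_FOO_OBJS) $($(MACH)_FOO_OBJS)

all:
	@echo "would build: $(FOO_OBJS)"
```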
diff --git a/uts/common/Makefile.files b/uts/common/Makefile.files new file mode 100644 index 000000000000..ec08410b4ff3 --- /dev/null +++ b/uts/common/Makefile.files @@ -0,0 +1,2007 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# This Makefile defines all file modules for the directory uts/common +# and its children. These are the source files which may be considered +# common to all SunOS systems. + +i386_CORE_OBJS += \ + atomic.o \ + avintr.o \ + pic.o + +sparc_CORE_OBJS += + +COMMON_CORE_OBJS += \ + beep.o \ + bitset.o \ + bp_map.o \ + brand.o \ + cpucaps.o \ + cmt.o \ + cmt_policy.o \ + cpu.o \ + cpu_event.o \ + cpu_intr.o \ + cpu_pm.o \ + cpupart.o \ + cap_util.o \ + disp.o \ + group.o \ + kstat_fr.o \ + iscsiboot_prop.o \ + lgrp.o \ + lgrp_topo.o \ + mmapobj.o \ + mutex.o \ + page_lock.o \ + page_retire.o \ + panic.o \ + param.o \ + pg.o \ + pghw.o \ + putnext.o \ + rctl_proc.o \ + rwlock.o \ + seg_kmem.o \ + softint.o \ + string.o \ + strtol.o \ + strtoul.o \ + strtoll.o \ + strtoull.o \ + thread_intr.o \ + vm_page.o \ + vm_pagelist.o \ + zlib_obj.o \ + clock_tick.o + +CORE_OBJS += $(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS) + +ZLIB_OBJS = zutil.o zmod.o zmod_subr.o \ + adler32.o crc32.o deflate.o inffast.o \ + inflate.o inftrees.o trees.o + +GENUNIX_OBJS += \ + access.o \ + acl.o \ + acl_common.o \ + adjtime.o \ + alarm.o \ + aio_subr.o \ + auditsys.o \ + audit_core.o \ + audit_zone.o \ + audit_memory.o \ + autoconf.o \ + avl.o \ + bdev_dsort.o \ + bio.o \ + bitmap.o \ + blabel.o \ + brandsys.o \ + bz2blocksort.o \ + bz2compress.o \ + bz2decompress.o \ + bz2randtable.o \ + bz2bzlib.o \ + bz2crctable.o \ + bz2huffman.o \ + callb.o \ + callout.o \ + chdir.o \ + chmod.o \ + chown.o \ + cladm.o \ + class.o \ + clock.o \ + clock_highres.o \ + clock_realtime.o\ + close.o \ + compress.o \ + condvar.o \ + conf.o \ + console.o \ + contract.o \ + copyops.o \ + core.o \ + corectl.o \ + cred.o \ + cs_stubs.o \ + dacf.o \ + dacf_clnt.o \ + damap.o \ + cyclic.o \ + ddi.o \ + ddifm.o \ + ddi_hp_impl.o \ + ddi_hp_ndi.o \ + ddi_intr.o \ + ddi_intr_impl.o \ + ddi_intr_irm.o \ + ddi_nodeid.o \ + ddi_timer.o \ + devcfg.o \ + devcache.o \ + device.o \ + devid.o \ + devid_cache.o \ + devid_scsi.o \ + devid_smp.o \ + devpolicy.o \ + disp_lock.o \ + dnlc.o \ + driver.o \ + dumpsubr.o \ + driver_lyr.o \ + dtrace_subr.o \ + errorq.o \ + etheraddr.o \ + evchannels.o \ + exacct.o \ + exacct_core.o \ + exec.o \ + exit.o \ + fbio.o \ + fcntl.o \ + fdbuffer.o \ + fdsync.o \ + fem.o \ + ffs.o \ + fio.o \ + flock.o \ + fm.o \ + fork.o \ + vpm.o \ + fs_reparse.o \ + fs_subr.o \ + fsflush.o \ + ftrace.o \ + getcwd.o \ + getdents.o \ + getloadavg.o \ + 
getpagesizes.o \ + getpid.o \ + gfs.o \ + rusagesys.o \ + gid.o \ + groups.o \ + grow.o \ + hat.o \ + hat_refmod.o \ + id32.o \ + id_space.o \ + inet_ntop.o \ + instance.o \ + ioctl.o \ + ip_cksum.o \ + issetugid.o \ + ippconf.o \ + kcpc.o \ + kdi.o \ + kiconv.o \ + klpd.o \ + kmem.o \ + ksyms_snapshot.o \ + l_strplumb.o \ + labelsys.o \ + link.o \ + list.o \ + lockstat_subr.o \ + log_sysevent.o \ + logsubr.o \ + lookup.o \ + lseek.o \ + ltos.o \ + lwp.o \ + lwp_create.o \ + lwp_info.o \ + lwp_self.o \ + lwp_sobj.o \ + lwp_timer.o \ + lwpsys.o \ + main.o \ + mmapobjsys.o \ + memcntl.o \ + memstr.o \ + lgrpsys.o \ + mkdir.o \ + mknod.o \ + mount.o \ + move.o \ + msacct.o \ + multidata.o \ + nbmlock.o \ + ndifm.o \ + nice.o \ + netstack.o \ + ntptime.o \ + nvpair.o \ + nvpair_alloc_system.o \ + nvpair_alloc_fixed.o \ + octet.o \ + open.o \ + p_online.o \ + pathconf.o \ + pathname.o \ + pause.o \ + serializer.o \ + pci_intr_lib.o \ + pci_cap.o \ + pcifm.o \ + pgrp.o \ + pgrpsys.o \ + pid.o \ + pkp_hash.o \ + policy.o \ + poll.o \ + pool.o \ + pool_pset.o \ + port_subr.o \ + ppriv.o \ + printf.o \ + priocntl.o \ + priv.o \ + priv_const.o \ + proc.o \ + procset.o \ + processor_bind.o \ + processor_info.o \ + profil.o \ + project.o \ + qsort.o \ + rctl.o \ + rctlsys.o \ + readlink.o \ + refstr.o \ + rename.o \ + resolvepath.o \ + retire_store.o \ + process.o \ + rlimit.o \ + rmap.o \ + rw.o \ + rwstlock.o \ + sad_conf.o \ + sid.o \ + sidsys.o \ + sched.o \ + schedctl.o \ + sctp_crc32.o \ + seg_dev.o \ + seg_kp.o \ + seg_kpm.o \ + seg_map.o \ + seg_vn.o \ + seg_spt.o \ + semaphore.o \ + sendfile.o \ + session.o \ + share.o \ + shuttle.o \ + sig.o \ + sigaction.o \ + sigaltstack.o \ + signotify.o \ + sigpending.o \ + sigprocmask.o \ + sigqueue.o \ + sigsendset.o \ + sigsuspend.o \ + sigtimedwait.o \ + sleepq.o \ + sock_conf.o \ + space.o \ + sscanf.o \ + stat.o \ + statfs.o \ + statvfs.o \ + stol.o \ + str_conf.o \ + strcalls.o \ + stream.o \ + streamio.o \ + strext.o \ + strsubr.o \ + strsun.o \ + subr.o \ + sunddi.o \ + sunmdi.o \ + sunndi.o \ + sunpci.o \ + sunpm.o \ + sundlpi.o \ + suntpi.o \ + swap_subr.o \ + swap_vnops.o \ + symlink.o \ + sync.o \ + sysclass.o \ + sysconfig.o \ + sysent.o \ + sysfs.o \ + systeminfo.o \ + task.o \ + taskq.o \ + tasksys.o \ + time.o \ + timer.o \ + times.o \ + timers.o \ + thread.o \ + tlabel.o \ + tnf_res.o \ + turnstile.o \ + tty_common.o \ + u8_textprep.o \ + uadmin.o \ + uconv.o \ + ucredsys.o \ + uid.o \ + umask.o \ + umount.o \ + uname.o \ + unix_bb.o \ + unlink.o \ + urw.o \ + utime.o \ + utssys.o \ + uucopy.o \ + vfs.o \ + vfs_conf.o \ + vmem.o \ + vm_anon.o \ + vm_as.o \ + vm_meter.o \ + vm_pageout.o \ + vm_pvn.o \ + vm_rm.o \ + vm_seg.o \ + vm_subr.o \ + vm_swap.o \ + vm_usage.o \ + vnode.o \ + vuid_queue.o \ + vuid_store.o \ + waitq.o \ + watchpoint.o \ + yield.o \ + scsi_confdata.o \ + xattr.o \ + xattr_common.o \ + xdr_mblk.o \ + xdr_mem.o \ + xdr.o \ + xdr_array.o \ + xdr_refer.o \ + xhat.o \ + zone.o + +# +# Stubs for the stand-alone linker/loader +# +sparc_GENSTUBS_OBJS = \ + kobj_stubs.o + +i386_GENSTUBS_OBJS = + +COMMON_GENSTUBS_OBJS = + +GENSTUBS_OBJS += $(COMMON_GENSTUBS_OBJS) $($(MACH)_GENSTUBS_OBJS) + +# +# DTrace and DTrace Providers +# +DTRACE_OBJS += dtrace.o dtrace_isa.o dtrace_asm.o + +SDT_OBJS += sdt_subr.o + +PROFILE_OBJS += profile.o + +SYSTRACE_OBJS += systrace.o + +LOCKSTAT_OBJS += lockstat.o + +FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o + +DCPC_OBJS += dcpc.o + +# +# Driver (pseudo-driver) Modules +# +IPP_OBJS += ippctl.o + 
+AUDIO_OBJS += audio_client.o audio_ddi.o audio_engine.o \ + audio_fltdata.o audio_format.o audio_ctrl.o \ + audio_grc3.o audio_output.o audio_input.o \ + audio_oss.o audio_sun.o + +AUDIOEMU10K_OBJS += audioemu10k.o + +AUDIOENS_OBJS += audioens.o + +AUDIOVIA823X_OBJS += audiovia823x.o + +AUDIOVIA97_OBJS += audiovia97.o + +AUDIO1575_OBJS += audio1575.o + +AUDIO810_OBJS += audio810.o + +AUDIOCMI_OBJS += audiocmi.o + +AUDIOHD_OBJS += audiohd.o + +AUDIOIXP_OBJS += audioixp.o + +AUDIOLS_OBJS += audiols.o + +AUDIOP16X_OBJS += audiop16x.o + +AUDIOPCI_OBJS += audiopci.o + +AUDIOSOLO_OBJS += audiosolo.o + +AUDIOTS_OBJS += audiots.o + +AC97_OBJS += ac97.o ac97_ad.o ac97_alc.o ac97_cmi.o + +BLKDEV_OBJS += blkdev.o + +CARDBUS_OBJS += cardbus.o cardbus_hp.o cardbus_cfg.o + +CONSKBD_OBJS += conskbd.o + +CONSMS_OBJS += consms.o + +OLDPTY_OBJS += tty_ptyconf.o + +PTC_OBJS += tty_pty.o + +PTSL_OBJS += tty_pts.o + +PTM_OBJS += ptm.o + +MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \ + mii_marvell.o mii_realtek.o mii_other.o + +PTS_OBJS += pts.o + +PTY_OBJS += ptms_conf.o + +SAD_OBJS += sad.o + +MD4_OBJS += md4.o md4_mod.o + +MD5_OBJS += md5.o md5_mod.o + +SHA1_OBJS += sha1.o sha1_mod.o fips_sha1_util.o + +SHA2_OBJS += sha2.o sha2_mod.o fips_sha2_util.o + +IPGPC_OBJS += classifierddi.o classifier.o filters.o trie.o table.o \ + ba_table.o + +DSCPMK_OBJS += dscpmk.o dscpmkddi.o + +DLCOSMK_OBJS += dlcosmk.o dlcosmkddi.o + +FLOWACCT_OBJS += flowacctddi.o flowacct.o + +TOKENMT_OBJS += tokenmt.o tokenmtddi.o + +TSWTCL_OBJS += tswtcl.o tswtclddi.o + +ARP_OBJS += arpddi.o + +ICMP_OBJS += icmpddi.o + +ICMP6_OBJS += icmp6ddi.o + +RTS_OBJS += rtsddi.o + +IP_ICMP_OBJS = icmp.o icmp_opt_data.o +IP_RTS_OBJS = rts.o rts_opt_data.o +IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_opt_data.o tcp_sack.o tcp_stats.o \ + tcp_misc.o tcp_timers.o tcp_time_wait.o tcp_tpi.o tcp_output.o \ + tcp_input.o tcp_socket.o tcp_bind.o tcp_cluster.o tcp_tunables.o +IP_UDP_OBJS = udp.o udp_opt_data.o udp_tunables.o udp_stats.o +IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ + sctp_init.o sctp_input.o sctp_cookie.o \ + sctp_conn.o sctp_error.o sctp_snmp.o \ + sctp_tunables.o sctp_shutdown.o sctp_common.o \ + sctp_timer.o sctp_heartbeat.o sctp_hash.o \ + sctp_bind.o sctp_notify.o sctp_asconf.o \ + sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o \ + sctp_misc.o +IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o + +IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ + ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ + ip_multi.o ip2mac.o ip_ndp.o ip_rts.o ip_srcid.o \ + ipddi.o ipdrop.o mi.o nd.o tunables.o optcom.o snmpcom.o \ + ipsec_loader.o spd.o ipclassifier.o inet_common.o ip_squeue.o \ + squeue.o ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \ + ip_helper_stream.o ip_tunables.o \ + ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \ + conn_opt.o ip_attr.o ip_dce.o \ + $(IP_ICMP_OBJS) \ + $(IP_RTS_OBJS) \ + $(IP_TCP_OBJS) \ + $(IP_UDP_OBJS) \ + $(IP_SCTP_OBJS) \ + $(IP_ILB_OBJS) + +IP6_OBJS += ip6ddi.o + +HOOK_OBJS += hook.o + +NETI_OBJS += neti_impl.o neti_mod.o neti_stack.o + +KEYSOCK_OBJS += keysockddi.o keysock.o keysock_opt_data.o + +IPNET_OBJS += ipnet.o ipnet_bpf.o + +SPDSOCK_OBJS += spdsockddi.o spdsock.o spdsock_opt_data.o + +IPSECESP_OBJS += ipsecespddi.o ipsecesp.o + +IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o + +SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o + +SPPPTUN_OBJS += sppptun.o sppptun_mod.o + +SPPPASYN_OBJS += spppasyn.o spppasyn_mod.o + 
+SPPPCOMP_OBJS += spppcomp.o spppcomp_mod.o deflate.o bsd-comp.o vjcompress.o \ + zlib.o + +TCP_OBJS += tcpddi.o + +TCP6_OBJS += tcp6ddi.o + +NCA_OBJS += ncaddi.o + +SDP_SOCK_MOD_OBJS += sockmod_sdp.o socksdp.o socksdpsubr.o + +SCTP_SOCK_MOD_OBJS += sockmod_sctp.o socksctp.o socksctpsubr.o + +PFP_SOCK_MOD_OBJS += sockmod_pfp.o + +RDS_SOCK_MOD_OBJS += sockmod_rds.o + +RDS_OBJS += rdsddi.o rdssubr.o rds_opt.o rds_ioctl.o + +RDSIB_OBJS += rdsib.o rdsib_ib.o rdsib_cm.o rdsib_ep.o rdsib_buf.o \ + rdsib_debug.o rdsib_sc.o + +RDSV3_OBJS += af_rds.o rdsv3_ddi.o bind.o loop.o threads.o connection.o \ + transport.o cong.o sysctl.o message.o rds_recv.o send.o \ + stats.o info.o page.o rdma_transport.o ib_ring.o ib_rdma.o \ + ib_recv.o ib.o ib_send.o ib_sysctl.o ib_stats.o ib_cm.o \ + rdsv3_sc.o rdsv3_debug.o rdsv3_impl.o rdma.o rdsv3_af_thr.o + +ISER_OBJS += iser.o iser_cm.o iser_cq.o iser_ib.o iser_idm.o \ + iser_resource.o iser_xfer.o + +UDP_OBJS += udpddi.o + +UDP6_OBJS += udp6ddi.o + +SY_OBJS += gentty.o + +TCO_OBJS += ticots.o + +TCOO_OBJS += ticotsord.o + +TCL_OBJS += ticlts.o + +TL_OBJS += tl.o + +DUMP_OBJS += dump.o + +BPF_OBJS += bpf.o bpf_filter.o bpf_mod.o bpf_dlt.o bpf_mac.o + +CLONE_OBJS += clone.o + +CN_OBJS += cons.o + +DLD_OBJS += dld_drv.o dld_proto.o dld_str.o dld_flow.o + +DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_mgmt.o + +GLD_OBJS += gld.o gldutil.o + +MAC_OBJS += mac.o mac_bcast.o mac_client.o mac_datapath_setup.o mac_flow.o \ + mac_hio.o mac_mod.o mac_ndd.o mac_provider.o mac_sched.o \ + mac_protect.o mac_soft_ring.o mac_stat.o mac_util.o + +MAC_6TO4_OBJS += mac_6to4.o + +MAC_ETHER_OBJS += mac_ether.o + +MAC_IPV4_OBJS += mac_ipv4.o + +MAC_IPV6_OBJS += mac_ipv6.o + +MAC_WIFI_OBJS += mac_wifi.o + +MAC_IB_OBJS += mac_ib.o + +IPTUN_OBJS += iptun_dev.o iptun_ctl.o iptun.o + +AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \ + aggr_send.o aggr_recv.o aggr_lacp.o + +SOFTMAC_OBJS += softmac_main.o softmac_ctl.o softmac_capab.o \ + softmac_dev.o softmac_stat.o softmac_pkt.o softmac_fp.o + +NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ + net80211_output.o net80211_node.o net80211_crypto.o \ + net80211_crypto_none.o net80211_crypto_wep.o net80211_ioctl.o \ + net80211_crypto_tkip.o net80211_crypto_ccmp.o \ + net80211_ht.o + +VNIC_OBJS += vnic_ctl.o vnic_dev.o + +SIMNET_OBJS += simnet.o + +IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o + +IBCM_OBJS += ibcm_impl.o ibcm_sm.o ibcm_ti.o ibcm_utils.o ibcm_path.o \ + ibcm_arp.o ibcm_arp_link.o + +IBDM_OBJS += ibdm.o + +IBDMA_OBJS += ibdma.o + +IBMF_OBJS += ibmf.o ibmf_impl.o ibmf_dr.o ibmf_wqe.o ibmf_ud_dest.o ibmf_mod.o \ + ibmf_send.o ibmf_recv.o ibmf_handlers.o ibmf_trans.o \ + ibmf_timers.o ibmf_msg.o ibmf_utils.o ibmf_rmpp.o \ + ibmf_saa.o ibmf_saa_impl.o ibmf_saa_utils.o ibmf_saa_events.o + +IBTL_OBJS += ibtl_impl.o ibtl_util.o ibtl_mem.o ibtl_handlers.o ibtl_qp.o \ + ibtl_cq.o ibtl_wr.o ibtl_hca.o ibtl_chan.o ibtl_cm.o \ + ibtl_mcg.o ibtl_ibnex.o ibtl_srq.o ibtl_part.o + +TAVOR_OBJS += tavor.o tavor_agents.o tavor_cfg.o tavor_ci.o tavor_cmd.o \ + tavor_cq.o tavor_event.o tavor_ioctl.o tavor_misc.o \ + tavor_mr.o tavor_qp.o tavor_qpmod.o tavor_rsrc.o \ + tavor_srq.o tavor_stats.o tavor_umap.o tavor_wr.o + +HERMON_OBJS += hermon.o hermon_agents.o hermon_cfg.o hermon_ci.o hermon_cmd.o \ + hermon_cq.o hermon_event.o hermon_ioctl.o hermon_misc.o \ + hermon_mr.o hermon_qp.o hermon_qpmod.o hermon_rsrc.o \ + hermon_srq.o hermon_stats.o hermon_umap.o hermon_wr.o \ + hermon_fcoib.o hermon_fm.o + +DAPLT_OBJS += 
daplt.o + +SOL_OFS_OBJS += sol_cma.o sol_ib_cma.o sol_uobj.o \ + sol_ofs_debug_util.o sol_ofs_gen_util.o \ + sol_kverbs.o + +SOL_UCMA_OBJS += sol_ucma.o + +SOL_UVERBS_OBJS += sol_uverbs.o sol_uverbs_comp.o sol_uverbs_event.o \ + sol_uverbs_hca.o sol_uverbs_qp.o + +SOL_UMAD_OBJS += sol_umad.o + +KSTAT_OBJS += kstat.o + +KSYMS_OBJS += ksyms.o + +INSTANCE_OBJS += inst_sync.o + +IWSCN_OBJS += iwscons.o + +LOFI_OBJS += lofi.o LzmaDec.o + +FSSNAP_OBJS += fssnap.o + +FSSNAPIF_OBJS += fssnap_if.o + +MM_OBJS += mem.o + +PHYSMEM_OBJS += physmem.o + +OPTIONS_OBJS += options.o + +WINLOCK_OBJS += winlockio.o + +PM_OBJS += pm.o +SRN_OBJS += srn.o + +PSEUDO_OBJS += pseudonex.o + +RAMDISK_OBJS += ramdisk.o + +LLC1_OBJS += llc1.o + +USBKBM_OBJS += usbkbm.o + +USBWCM_OBJS += usbwcm.o + +BOFI_OBJS += bofi.o + +HID_OBJS += hid.o + +HWA_RC_OBJS += hwarc.o + +USBSKEL_OBJS += usbskel.o + +USBVC_OBJS += usbvc.o usbvc_v4l2.o + +HIDPARSER_OBJS += hidparser.o + +USB_AC_OBJS += usb_ac.o + +USB_AS_OBJS += usb_as.o + +USB_AH_OBJS += usb_ah.o + +USBMS_OBJS += usbms.o + +USBPRN_OBJS += usbprn.o + +UGEN_OBJS += ugen.o + +USBSER_OBJS += usbser.o usbser_rseq.o + +USBSACM_OBJS += usbsacm.o + +USBSER_KEYSPAN_OBJS += usbser_keyspan.o keyspan_dsd.o keyspan_pipe.o + +USBS49_FW_OBJS += keyspan_49fw.o + +USBSPRL_OBJS += usbser_pl2303.o pl2303_dsd.o + +WUSB_CA_OBJS += wusb_ca.o + +USBFTDI_OBJS += usbser_uftdi.o uftdi_dsd.o + +USBECM_OBJS += usbecm.o + +WC_OBJS += wscons.o vcons.o + +VCONS_CONF_OBJS += vcons_conf.o + +SCSI_OBJS += scsi_capabilities.o scsi_confsubr.o scsi_control.o \ + scsi_data.o scsi_fm.o scsi_hba.o scsi_reset_notify.o \ + scsi_resource.o scsi_subr.o scsi_transport.o scsi_watch.o \ + smp_transport.o + +SCSI_VHCI_OBJS += scsi_vhci.o mpapi_impl.o scsi_vhci_tpgs.o + +SCSI_VHCI_F_SYM_OBJS += sym.o + +SCSI_VHCI_F_TPGS_OBJS += tpgs.o + +SCSI_VHCI_F_ASYM_SUN_OBJS += asym_sun.o + +SCSI_VHCI_F_SYM_HDS_OBJS += sym_hds.o + +SCSI_VHCI_F_TAPE_OBJS += tape.o + +SCSI_VHCI_F_TPGS_TAPE_OBJS += tpgs_tape.o + +SGEN_OBJS += sgen.o + +SMP_OBJS += smp.o + +SATA_OBJS += sata.o + +USBA_OBJS += hcdi.o usba.o usbai.o hubdi.o parser.o genconsole.o \ + usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \ + usba_devdb.o usba10_calls.o usba_ugen.o whcdi.o wa.o +USBA_WITHOUT_WUSB_OBJS += hcdi.o usba.o usbai.o hubdi.o parser.o genconsole.o \ + usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \ + usba_devdb.o usba10_calls.o usba_ugen.o + +USBA10_OBJS += usba10.o + +RSM_OBJS += rsm.o rsmka_pathmanager.o rsmka_util.o + +RSMOPS_OBJS += rsmops.o + +S1394_OBJS += t1394.o t1394_errmsg.o s1394.o s1394_addr.o s1394_asynch.o \ + s1394_bus_reset.o s1394_cmp.o s1394_csr.o s1394_dev_disc.o \ + s1394_fa.o s1394_fcp.o \ + s1394_hotplug.o s1394_isoch.o s1394_misc.o h1394.o nx1394.o + +HCI1394_OBJS += hci1394.o hci1394_async.o hci1394_attach.o hci1394_buf.o \ + hci1394_csr.o hci1394_detach.o hci1394_extern.o \ + hci1394_ioctl.o hci1394_isoch.o hci1394_isr.o \ + hci1394_ixl_comp.o hci1394_ixl_isr.o hci1394_ixl_misc.o \ + hci1394_ixl_update.o hci1394_misc.o hci1394_ohci.o \ + hci1394_q.o hci1394_s1394if.o hci1394_tlabel.o \ + hci1394_tlist.o hci1394_vendor.o + +AV1394_OBJS += av1394.o av1394_as.o av1394_async.o av1394_cfgrom.o \ + av1394_cmp.o av1394_fcp.o av1394_isoch.o av1394_isoch_chan.o \ + av1394_isoch_recv.o av1394_isoch_xmit.o av1394_list.o \ + av1394_queue.o + +DCAM1394_OBJS += dcam.o dcam_frame.o dcam_param.o dcam_reg.o \ + dcam_ring_buff.o + +SCSA1394_OBJS += hba.o sbp2_driver.o sbp2_bus.o + +SBP2_OBJS += cfgrom.o sbp2.o + +PMODEM_OBJS 
+= pmodem.o pmodem_cis.o cis.o cis_callout.o cis_handlers.o cis_params.o + +DSW_OBJS += dsw.o dsw_dev.o ii_tree.o + +NCALL_OBJS += ncall.o \ + ncall_stub.o + +RDC_OBJS += rdc.o \ + rdc_dev.o \ + rdc_io.o \ + rdc_clnt.o \ + rdc_prot_xdr.o \ + rdc_svc.o \ + rdc_bitmap.o \ + rdc_health.o \ + rdc_subr.o \ + rdc_diskq.o + +RDCSRV_OBJS += rdcsrv.o + +RDCSTUB_OBJS += rdc_stub.o + +SDBC_OBJS += sd_bcache.o \ + sd_bio.o \ + sd_conf.o \ + sd_ft.o \ + sd_hash.o \ + sd_io.o \ + sd_misc.o \ + sd_pcu.o \ + sd_tdaemon.o \ + sd_trace.o \ + sd_iob_impl0.o \ + sd_iob_impl1.o \ + sd_iob_impl2.o \ + sd_iob_impl3.o \ + sd_iob_impl4.o \ + sd_iob_impl5.o \ + sd_iob_impl6.o \ + sd_iob_impl7.o \ + safestore.o \ + safestore_ram.o + +NSCTL_OBJS += nsctl.o \ + nsc_cache.o \ + nsc_disk.o \ + nsc_dev.o \ + nsc_freeze.o \ + nsc_gen.o \ + nsc_mem.o \ + nsc_ncallio.o \ + nsc_power.o \ + nsc_resv.o \ + nsc_rmspin.o \ + nsc_solaris.o \ + nsc_trap.o \ + nsc_list.o +UNISTAT_OBJS += spuni.o \ + spcs_s_k.o + +NSKERN_OBJS += nsc_ddi.o \ + nsc_proc.o \ + nsc_raw.o \ + nsc_thread.o \ + nskernd.o + +SV_OBJS += sv.o + +PMCS_OBJS += pmcs_attach.o pmcs_ds.o pmcs_intr.o pmcs_nvram.o pmcs_sata.o \ + pmcs_scsa.o pmcs_smhba.o pmcs_subr.o pmcs_fwlog.o + +PMCS8001FW_C_OBJS += pmcs_fw_hdr.o +PMCS8001FW_OBJS += $(PMCS8001FW_C_OBJS) SPCBoot.o ila.o firmware.o + +# +# Build up defines and paths. + +ST_OBJS += st.o st_conf.o + +EMLXS_OBJS += emlxs_clock.o emlxs_dfc.o emlxs_dhchap.o emlxs_diag.o \ + emlxs_download.o emlxs_dump.o emlxs_els.o emlxs_event.o \ + emlxs_fcp.o emlxs_fct.o emlxs_hba.o emlxs_ip.o \ + emlxs_mbox.o emlxs_mem.o emlxs_msg.o emlxs_node.o \ + emlxs_pkt.o emlxs_sli3.o emlxs_sli4.o emlxs_solaris.o \ + emlxs_thread.o + +EMLXS_FW_OBJS += emlxs_fw.o + +OCE_OBJS += oce_buf.o oce_fm.o oce_gld.o oce_hw.o oce_intr.o oce_main.o \ + oce_mbx.o oce_mq.o oce_queue.o oce_rx.o oce_stat.o oce_tx.o \ + oce_utils.o + +FCT_OBJS += discovery.o fct.o + +QLT_OBJS += 2400.o 2500.o 8100.o qlt.o qlt_dma.o + +SRPT_OBJS += srpt_mod.o srpt_ch.o srpt_cm.o srpt_ioc.o srpt_stp.o + +FCOE_OBJS += fcoe.o fcoe_eth.o fcoe_fc.o + +FCOET_OBJS += fcoet.o fcoet_eth.o fcoet_fc.o + +FCOEI_OBJS += fcoei.o fcoei_eth.o fcoei_lv.o + +ISCSIT_SHARED_OBJS += \ + iscsit_common.o + +ISCSIT_OBJS += $(ISCSIT_SHARED_OBJS) \ + iscsit.o iscsit_tgt.o iscsit_sess.o iscsit_login.o \ + iscsit_text.o iscsit_isns.o iscsit_radiusauth.o \ + iscsit_radiuspacket.o iscsit_auth.o iscsit_authclient.o + +PPPT_OBJS += alua_ic_if.o pppt.o pppt_msg.o pppt_tgt.o + +STMF_OBJS += lun_map.o stmf.o + +STMF_SBD_OBJS += sbd.o sbd_scsi.o sbd_pgr.o sbd_zvol.o + +SYSMSG_OBJS += sysmsg.o + +SES_OBJS += ses.o ses_sen.o ses_safte.o ses_ses.o + +TNF_OBJS += tnf_buf.o tnf_trace.o tnf_writer.o trace_init.o \ + trace_funcs.o tnf_probe.o tnf.o + +LOGINDMUX_OBJS += logindmux.o + +DEVINFO_OBJS += devinfo.o + +DEVPOLL_OBJS += devpoll.o + +DEVPOOL_OBJS += devpool.o + +I8042_OBJS += i8042.o + +KB8042_OBJS += \ + at_keyprocess.o \ + kb8042.o \ + kb8042_keytables.o + +MOUSE8042_OBJS += mouse8042.o + +FDC_OBJS += fdc.o + +ASY_OBJS += asy.o + +ECPP_OBJS += ecpp.o + +VUIDM3P_OBJS += vuidmice.o vuidm3p.o + +VUIDM4P_OBJS += vuidmice.o vuidm4p.o + +VUIDM5P_OBJS += vuidmice.o vuidm5p.o + +VUIDPS2_OBJS += vuidmice.o vuidps2.o + +HPCSVC_OBJS += hpcsvc.o + +PCIE_MISC_OBJS += pcie.o pcie_fault.o pcie_hp.o pciehpc.o pcishpc.o pcie_pwr.o pciev.o + +PCIHPNEXUS_OBJS += pcihp.o + +OPENEEPR_OBJS += openprom.o + +RANDOM_OBJS += random.o + +PSHOT_OBJS += pshot.o + +GEN_DRV_OBJS += gen_drv.o + +TCLIENT_OBJS += tclient.o + +TPHCI_OBJS += tphci.o 
+ +TVHCI_OBJS += tvhci.o + +EMUL64_OBJS += emul64.o emul64_bsd.o + +FCP_OBJS += fcp.o + +FCIP_OBJS += fcip.o + +FCSM_OBJS += fcsm.o + +FCTL_OBJS += fctl.o + +FP_OBJS += fp.o + +QLC_OBJS += ql_api.o ql_debug.o ql_hba_fru.o ql_init.o ql_iocb.o ql_ioctl.o \ + ql_isr.o ql_mbx.o ql_nx.o ql_xioctl.o ql_fw_table.o + +QLC_FW_2200_OBJS += ql_fw_2200.o + +QLC_FW_2300_OBJS += ql_fw_2300.o + +QLC_FW_2400_OBJS += ql_fw_2400.o + +QLC_FW_2500_OBJS += ql_fw_2500.o + +QLC_FW_6322_OBJS += ql_fw_6322.o + +QLC_FW_8100_OBJS += ql_fw_8100.o + +QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o + +ZCONS_OBJS += zcons.o + +NV_SATA_OBJS += nv_sata.o + +SI3124_OBJS += si3124.o + +AHCI_OBJS += ahci.o + +PCIIDE_OBJS += pci-ide.o + +PCEPP_OBJS += pcepp.o + +CPC_OBJS += cpc.o + +CPUID_OBJS += cpuid_drv.o + +SYSEVENT_OBJS += sysevent.o + +BL_OBJS += bl.o + +DRM_OBJS += drm_sunmod.o drm_kstat.o drm_agpsupport.o \ + drm_auth.o drm_bufs.o drm_context.o drm_dma.o \ + drm_drawable.o drm_drv.o drm_fops.o drm_ioctl.o drm_irq.o \ + drm_lock.o drm_memory.o drm_msg.o drm_pci.o drm_scatter.o \ + drm_cache.o drm_gem.o drm_mm.o ati_pcigart.o + +FM_OBJS += devfm.o devfm_machdep.o + +RTLS_OBJS += rtls.o + +# +# exec modules +# +AOUTEXEC_OBJS +=aout.o + +ELFEXEC_OBJS += elf.o elf_notes.o old_notes.o + +INTPEXEC_OBJS +=intp.o + +SHBINEXEC_OBJS +=shbin.o + +JAVAEXEC_OBJS +=java.o + +# +# file system modules +# +AUTOFS_OBJS += auto_vfsops.o auto_vnops.o auto_subr.o auto_xdr.o auto_sys.o + +CACHEFS_OBJS += cachefs_cnode.o cachefs_cod.o \ + cachefs_dir.o cachefs_dlog.o cachefs_filegrp.o \ + cachefs_fscache.o cachefs_ioctl.o cachefs_log.o \ + cachefs_module.o \ + cachefs_noopc.o cachefs_resource.o \ + cachefs_strict.o \ + cachefs_subr.o cachefs_vfsops.o \ + cachefs_vnops.o + +DCFS_OBJS += dc_vnops.o + +DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o + +DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \ + sdev_ptsops.o sdev_zvolops.o sdev_comm.o \ + sdev_profile.o sdev_ncache.o sdev_netops.o \ + sdev_ipnetops.o \ + sdev_vtops.o + +CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \ + ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o + +OBJFS_OBJS += objfs_vfs.o objfs_root.o objfs_common.o \ + objfs_odir.o objfs_data.o + +FDFS_OBJS += fdops.o + +FIFO_OBJS += fifosubr.o fifovnops.o + +PIPE_OBJS += pipe.o + +HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ + hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o + +LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o + +NAMEFS_OBJS += namevfs.o namevno.o + +NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \ + nfs_subr.o nfs_vfsops.o nfs_vnops.o \ + nfs_xdr.o nfs_sys.o nfs_strerror.o \ + nfs3_vfsops.o nfs3_vnops.o nfs3_xdr.o \ + nfs_acl_vnops.o nfs_acl_xdr.o nfs4_vfsops.o \ + nfs4_vnops.o nfs4_xdr.o nfs4_idmap.o \ + nfs4_shadow.o nfs4_subr.o \ + nfs4_attr.o nfs4_rnode.o nfs4_client.o \ + nfs4_acache.o nfs4_common.o nfs4_client_state.o \ + nfs4_callback.o nfs4_recovery.o nfs4_client_secinfo.o \ + nfs4_client_debug.o nfs_stats.o \ + nfs4_acl.o nfs4_stub_vnops.o nfs_cmd.o + +NFSSRV_OBJS += nfs_server.o nfs_srv.o nfs3_srv.o \ + nfs_acl_srv.o nfs_auth.o nfs_auth_xdr.o \ + nfs_export.o nfs_log.o nfs_log_xdr.o \ + nfs4_srv.o nfs4_state.o nfs4_srv_attr.o \ + nfs4_srv_ns.o nfs4_db.o nfs4_srv_deleg.o \ + nfs4_deleg_ops.o nfs4_srv_readdir.o nfs4_dispatch.o + +SMBSRV_SHARED_OBJS += \ + smb_inet.o \ + smb_match.o \ + smb_msgbuf.o \ + smb_oem.o \ + smb_string.o \ + smb_utf8.o \ + smb_door_legacy.o \ + smb_xdr.o \ + smb_token.o \ + smb_token_xdr.o \ + 
smb_sid.o \ + smb_native.o \ + smb_netbios_util.o + +SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \ + smb_acl.o \ + smb_alloc.o \ + smb_close.o \ + smb_common_open.o \ + smb_common_transact.o \ + smb_create.o \ + smb_delete.o \ + smb_directory.o \ + smb_dispatch.o \ + smb_echo.o \ + smb_fem.o \ + smb_find.o \ + smb_flush.o \ + smb_fsinfo.o \ + smb_fsops.o \ + smb_init.o \ + smb_kdoor.o \ + smb_kshare.o \ + smb_kutil.o \ + smb_lock.o \ + smb_lock_byte_range.o \ + smb_locking_andx.o \ + smb_logoff_andx.o \ + smb_mangle_name.o \ + smb_mbuf_marshaling.o \ + smb_mbuf_util.o \ + smb_negotiate.o \ + smb_net.o \ + smb_node.o \ + smb_nt_cancel.o \ + smb_nt_create_andx.o \ + smb_nt_transact_create.o \ + smb_nt_transact_ioctl.o \ + smb_nt_transact_notify_change.o \ + smb_nt_transact_quota.o \ + smb_nt_transact_security.o \ + smb_odir.o \ + smb_ofile.o \ + smb_open_andx.o \ + smb_opipe.o \ + smb_oplock.o \ + smb_pathname.o \ + smb_print.o \ + smb_process_exit.o \ + smb_query_fileinfo.o \ + smb_read.o \ + smb_rename.o \ + smb_sd.o \ + smb_seek.o \ + smb_server.o \ + smb_session.o \ + smb_session_setup_andx.o \ + smb_set_fileinfo.o \ + smb_signing.o \ + smb_tree.o \ + smb_trans2_create_directory.o \ + smb_trans2_dfs.o \ + smb_trans2_find.o \ + smb_tree_connect.o \ + smb_unlock_byte_range.o \ + smb_user.o \ + smb_vfs.o \ + smb_vops.o \ + smb_vss.o \ + smb_write.o \ + smb_write_raw.o + +PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \ + pc_vfsops.o pc_vnops.o + +PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \ + prvfsops.o prvnops.o + +MNTFS_OBJS += mntvfsops.o mntvnops.o + +SHAREFS_OBJS += sharetab.o sharefs_vfsops.o sharefs_vnops.o + +SPEC_OBJS += specsubr.o specvfsops.o specvnops.o + +SOCK_OBJS += socksubr.o sockvfsops.o sockparams.o \ + socksyscalls.o socktpi.o sockstr.o \ + sockcommon_vnops.o sockcommon_subr.o \ + sockcommon_sops.o sockcommon.o \ + sock_notsupp.o socknotify.o \ + nl7c.o nl7curi.o nl7chttp.o nl7clogd.o \ + nl7cnca.o sodirect.o sockfilter.o + +TMPFS_OBJS += tmp_dir.o tmp_subr.o tmp_tnode.o tmp_vfsops.o \ + tmp_vnops.o + +UDFS_OBJS += udf_alloc.o udf_bmap.o udf_dir.o \ + udf_inode.o udf_subr.o udf_vfsops.o \ + udf_vnops.o + +UFS_OBJS += ufs_alloc.o ufs_bmap.o ufs_dir.o ufs_xattr.o \ + ufs_inode.o ufs_subr.o ufs_tables.o ufs_vfsops.o \ + ufs_vnops.o quota.o quotacalls.o quota_ufs.o \ + ufs_filio.o ufs_lockfs.o ufs_thread.o ufs_trans.o \ + ufs_acl.o ufs_panic.o ufs_directio.o ufs_log.o \ + ufs_extvnops.o ufs_snap.o lufs.o lufs_thread.o \ + lufs_log.o lufs_map.o lufs_top.o lufs_debug.o +VSCAN_OBJS += vscan_drv.o vscan_svc.o vscan_door.o + +NSMB_OBJS += smb_conn.o smb_dev.o smb_iod.o smb_pass.o \ + smb_rq.o smb_sign.o smb_smb.o smb_subrs.o \ + smb_time.o smb_tran.o smb_trantcp.o smb_usr.o \ + subr_mchain.o + +SMBFS_COMMON_OBJS += smbfs_ntacl.o +SMBFS_OBJS += smbfs_vfsops.o smbfs_vnops.o smbfs_node.o \ + smbfs_acl.o smbfs_client.o smbfs_smb.o \ + smbfs_subr.o smbfs_subr2.o \ + smbfs_rwlock.o smbfs_xattr.o \ + $(SMBFS_COMMON_OBJS) + + +# +# LVM modules +# +MD_OBJS += md.o md_error.o md_ioctl.o md_mddb.o md_names.o \ + md_med.o md_rename.o md_subr.o + +MD_COMMON_OBJS = md_convert.o md_crc.o md_revchk.o + +MD_DERIVED_OBJS = metamed_xdr.o meta_basic_xdr.o + +SOFTPART_OBJS += sp.o sp_ioctl.o + +STRIPE_OBJS += stripe.o stripe_ioctl.o + +HOTSPARES_OBJS += hotspares.o + +RAID_OBJS += raid.o raid_ioctl.o raid_replay.o raid_resync.o raid_hotspare.o + +MIRROR_OBJS += mirror.o mirror_ioctl.o mirror_resync.o + +NOTIFY_OBJS += md_notify.o + +TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o + 
+ZFS_COMMON_OBJS += \ + arc.o \ + bplist.o \ + bpobj.o \ + dbuf.o \ + ddt.o \ + ddt_zap.o \ + dmu.o \ + dmu_diff.o \ + dmu_send.o \ + dmu_object.o \ + dmu_objset.o \ + dmu_traverse.o \ + dmu_tx.o \ + dnode.o \ + dnode_sync.o \ + dsl_dir.o \ + dsl_dataset.o \ + dsl_deadlist.o \ + dsl_pool.o \ + dsl_synctask.o \ + dmu_zfetch.o \ + dsl_deleg.o \ + dsl_prop.o \ + dsl_scan.o \ + gzip.o \ + lzjb.o \ + metaslab.o \ + refcount.o \ + sa.o \ + sha256.o \ + spa.o \ + spa_config.o \ + spa_errlog.o \ + spa_history.o \ + spa_misc.o \ + space_map.o \ + txg.o \ + uberblock.o \ + unique.o \ + vdev.o \ + vdev_cache.o \ + vdev_file.o \ + vdev_label.o \ + vdev_mirror.o \ + vdev_missing.o \ + vdev_queue.o \ + vdev_raidz.o \ + vdev_root.o \ + zap.o \ + zap_leaf.o \ + zap_micro.o \ + zfs_byteswap.o \ + zfs_debug.o \ + zfs_fm.o \ + zfs_fuid.o \ + zfs_sa.o \ + zfs_znode.o \ + zil.o \ + zio.o \ + zio_checksum.o \ + zio_compress.o \ + zio_inject.o \ + zle.o \ + zrlock.o + +ZFS_SHARED_OBJS += \ + zfs_namecheck.o \ + zfs_deleg.o \ + zfs_prop.o \ + zfs_comutil.o \ + zfs_fletcher.o \ + zpool_prop.o \ + zprop_common.o + +ZFS_OBJS += \ + $(ZFS_COMMON_OBJS) \ + $(ZFS_SHARED_OBJS) \ + vdev_disk.o \ + zfs_acl.o \ + zfs_ctldir.o \ + zfs_dir.o \ + zfs_ioctl.o \ + zfs_log.o \ + zfs_onexit.o \ + zfs_replay.o \ + zfs_rlock.o \ + rrwlock.o \ + zfs_vfsops.o \ + zfs_vnops.o \ + zvol.o + +ZUT_OBJS += \ + zut.o + +# +# streams modules +# +BUFMOD_OBJS += bufmod.o + +CONNLD_OBJS += connld.o + +DEDUMP_OBJS += dedump.o + +DRCOMPAT_OBJS += drcompat.o + +LDLINUX_OBJS += ldlinux.o + +LDTERM_OBJS += ldterm.o uwidth.o + +PCKT_OBJS += pckt.o + +PFMOD_OBJS += pfmod.o + +PTEM_OBJS += ptem.o + +REDIRMOD_OBJS += strredirm.o + +TIMOD_OBJS += timod.o + +TIRDWR_OBJS += tirdwr.o + +TTCOMPAT_OBJS +=ttcompat.o + +LOG_OBJS += log.o + +PIPEMOD_OBJS += pipemod.o + +RPCMOD_OBJS += rpcmod.o clnt_cots.o clnt_clts.o \ + clnt_gen.o clnt_perr.o mt_rpcinit.o rpc_calmsg.o \ + rpc_prot.o rpc_sztypes.o rpc_subr.o rpcb_prot.o \ + svc.o svc_clts.o svc_gen.o svc_cots.o \ + rpcsys.o xdr_sizeof.o clnt_rdma.o svc_rdma.o \ + xdr_rdma.o rdma_subr.o xdrrdma_sizeof.o + +TLIMOD_OBJS += tlimod.o t_kalloc.o t_kbind.o t_kclose.o \ + t_kconnect.o t_kfree.o t_kgtstate.o t_kopen.o \ + t_krcvudat.o t_ksndudat.o t_kspoll.o t_kunbind.o \ + t_kutil.o + +RLMOD_OBJS += rlmod.o + +TELMOD_OBJS += telmod.o + +CRYPTMOD_OBJS += cryptmod.o + +KB_OBJS += kbd.o keytables.o + +# +# ID mapping module +# +IDMAP_OBJS += idmap_mod.o idmap_kapi.o idmap_xdr.o idmap_cache.o + +# +# scheduling class modules +# +SDC_OBJS += sysdc.o + +RT_OBJS += rt.o +RT_DPTBL_OBJS += rt_dptbl.o + +TS_OBJS += ts.o +TS_DPTBL_OBJS += ts_dptbl.o + +IA_OBJS += ia.o + +FSS_OBJS += fss.o + +FX_OBJS += fx.o +FX_DPTBL_OBJS += fx_dptbl.o + +# +# Inter-Process Communication (IPC) modules +# +IPC_OBJS += ipc.o + +IPCMSG_OBJS += msg.o + +IPCSEM_OBJS += sem.o + +IPCSHM_OBJS += shm.o + +# +# bignum module +# +COMMON_BIGNUM_OBJS += bignum_mod.o bignumimpl.o + +BIGNUM_OBJS += $(COMMON_BIGNUM_OBJS) $(BIGNUM_PSR_OBJS) + +# +# kernel cryptographic framework +# +KCF_OBJS += kcf.o kcf_callprov.o kcf_cbufcall.o kcf_cipher.o kcf_crypto.o \ + kcf_cryptoadm.o kcf_ctxops.o kcf_digest.o kcf_dual.o \ + kcf_keys.o kcf_mac.o kcf_mech_tabs.o kcf_miscapi.o \ + kcf_object.o kcf_policy.o kcf_prov_lib.o kcf_prov_tabs.o \ + kcf_sched.o kcf_session.o kcf_sign.o kcf_spi.o kcf_verify.o \ + kcf_random.o modes.o ecb.o cbc.o ctr.o ccm.o gcm.o \ + fips_random.o fips_checksum.o fips_test_vectors.o + +CRYPTOADM_OBJS += cryptoadm.o + +CRYPTO_OBJS += crypto.o + 
+DPROV_OBJS += dprov.o + +DCA_OBJS += dca.o dca_3des.o dca_debug.o dca_dsa.o dca_kstat.o dca_rng.o \ + dca_rsa.o + +AESPROV_OBJS += aes.o aes_impl.o aes_modes.o fips_aes_util.o + +ARCFOURPROV_OBJS += arcfour.o arcfour_crypt.o + +BLOWFISHPROV_OBJS += blowfish.o blowfish_impl.o + +ECCPROV_OBJS += ecc.o ec.o ec2_163.o ec2_mont.o ecdecode.o ecl_mult.o \ + ecp_384.o ecp_jac.o ec2_193.o ecl.o ecp_192.o ecp_521.o \ + ecp_jm.o ec2_233.o ecl_curve.o ecp_224.o ecp_aff.o \ + ecp_mont.o ec2_aff.o ec_naf.o ecl_gf.o ecp_256.o mp_gf2m.o \ + mpi.o mplogic.o mpmontg.o mpprime.o oid.o \ + secitem.o ec2_test.o ecp_test.o fips_ecc_util.o + +RSAPROV_OBJS += rsa.o rsa_impl.o pkcs1.o fips_rsa_util.o + +SWRANDPROV_OBJS += swrand.o fips_random_util.o + +# +# kernel SSL +# +KSSL_OBJS += kssl.o ksslioctl.o + +KSSL_SOCKFIL_MOD_OBJS += ksslfilter.o ksslapi.o ksslrec.o + +# +# misc. modules +# + +C2AUDIT_OBJS += adr.o audit.o audit_event.o audit_io.o \ + audit_path.o audit_start.o audit_syscalls.o audit_token.o \ + audit_mem.o + +PCIC_OBJS += pcic.o + +RPCSEC_OBJS += secmod.o sec_clnt.o sec_svc.o sec_gen.o \ + auth_des.o auth_kern.o auth_none.o auth_loopb.o\ + authdesprt.o authdesubr.o authu_prot.o \ + key_call.o key_prot.o svc_authu.o svcauthdes.o + +RPCSEC_GSS_OBJS += rpcsec_gssmod.o rpcsec_gss.o rpcsec_gss_misc.o \ + rpcsec_gss_utils.o svc_rpcsec_gss.o + +CONSCONFIG_OBJS += consconfig.o + +CONSCONFIG_DACF_OBJS += consconfig_dacf.o consplat.o + +TEM_OBJS += tem.o tem_safe.o 6x10.o 7x14.o 12x22.o + +KBTRANS_OBJS += \ + kbtrans.o \ + kbtrans_keytables.o \ + kbtrans_polled.o \ + kbtrans_streams.o \ + usb_keytables.o + +KGSSD_OBJS += gssd_clnt_stubs.o gssd_handle.o gssd_prot.o \ + gss_display_name.o gss_release_name.o gss_import_name.o \ + gss_release_buffer.o gss_release_oid_set.o gen_oids.o gssdmod.o + +KGSSD_DERIVED_OBJS = gssd_xdr.o + +KGSS_DUMMY_OBJS += dmech.o + +KSOCKET_OBJS += ksocket.o ksocket_mod.o + +CRYPTO= cksumtypes.o decrypt.o encrypt.o encrypt_length.o etypes.o \ + nfold.o verify_checksum.o prng.o block_size.o make_checksum.o\ + checksum_length.o hmac.o default_state.o mandatory_sumtype.o + +# crypto/des +CRYPTO_DES= f_cbc.o f_cksum.o f_parity.o weak_key.o d3_cbc.o ef_crypto.o + +CRYPTO_DK= checksum.o derive.o dk_decrypt.o dk_encrypt.o + +CRYPTO_ARCFOUR= k5_arcfour.o + +# crypto/enc_provider +CRYPTO_ENC= des.o des3.o arcfour_provider.o aes_provider.o + +# crypto/hash_provider +CRYPTO_HASH= hash_kef_generic.o hash_kmd5.o hash_crc32.o hash_ksha1.o + +# crypto/keyhash_provider +CRYPTO_KEYHASH= descbc.o k5_kmd5des.o k_hmac_md5.o + +# crypto/crc32 +CRYPTO_CRC32= crc32.o + +# crypto/old +CRYPTO_OLD= old_decrypt.o old_encrypt.o + +# crypto/raw +CRYPTO_RAW= raw_decrypt.o raw_encrypt.o + +K5_KRB= kfree.o copy_key.o \ + parse.o init_ctx.o \ + ser_adata.o ser_addr.o \ + ser_auth.o ser_cksum.o \ + ser_key.o ser_princ.o \ + serialize.o unparse.o \ + ser_actx.o + +K5_OS= timeofday.o toffset.o \ + init_os_ctx.o c_ustime.o + +SEAL= +# EXPORT DELETE START +SEAL= seal.o unseal.o +# EXPORT DELETE END + +MECH= delete_sec_context.o \ + import_sec_context.o \ + gssapi_krb5.o \ + k5seal.o k5unseal.o k5sealv3.o \ + ser_sctx.o \ + sign.o \ + util_crypt.o \ + util_validate.o util_ordering.o \ + util_seqnum.o util_set.o util_seed.o \ + wrap_size_limit.o verify.o + + + +MECH_GEN= util_token.o + + +KGSS_KRB5_OBJS += krb5mech.o \ + $(MECH) $(SEAL) $(MECH_GEN) \ + $(CRYPTO) $(CRYPTO_DES) $(CRYPTO_DK) $(CRYPTO_ARCFOUR) \ + $(CRYPTO_ENC) $(CRYPTO_HASH) \ + $(CRYPTO_KEYHASH) $(CRYPTO_CRC32) \ + $(CRYPTO_OLD) \ + $(CRYPTO_RAW) $(K5_KRB) 
$(K5_OS) + +DES_OBJS += des_crypt.o des_impl.o des_ks.o des_soft.o fips_des_util.o + +DLBOOT_OBJS += bootparam_xdr.o nfs_dlinet.o scan.o + +KRTLD_OBJS += kobj_bootflags.o getoptstr.o \ + kobj.o kobj_kdi.o kobj_lm.o kobj_subr.o + +MOD_OBJS += modctl.o modsubr.o modsysfile.o modconf.o modhash.o + +STRPLUMB_OBJS += strplumb.o + +CPR_OBJS += cpr_driver.o cpr_dump.o \ + cpr_main.o cpr_misc.o cpr_mod.o cpr_stat.o \ + cpr_uthread.o + +PROF_OBJS += prf.o + +SE_OBJS += se_driver.o + +SYSACCT_OBJS += acct.o + +ACCTCTL_OBJS += acctctl.o + +EXACCTSYS_OBJS += exacctsys.o + +KAIO_OBJS += aio.o + +PCMCIA_OBJS += pcmcia.o cs.o cis.o cis_callout.o cis_handlers.o cis_params.o + +BUSRA_OBJS += busra.o + +PCS_OBJS += pcs.o + +PCAN_OBJS += pcan.o + +PCATA_OBJS += pcide.o pcdisk.o pclabel.o pcata.o + +PCSER_OBJS += pcser.o pcser_cis.o + +PCWL_OBJS += pcwl.o + +PSET_OBJS += pset.o + +OHCI_OBJS += ohci.o ohci_hub.o ohci_polled.o + +UHCI_OBJS += uhci.o uhciutil.o uhcitgt.o uhcihub.o uhcipolled.o + +EHCI_OBJS += ehci.o ehci_hub.o ehci_xfer.o ehci_intr.o ehci_util.o ehci_polled.o ehci_isoch.o ehci_isoch_util.o + +HUBD_OBJS += hubd.o + +USB_MID_OBJS += usb_mid.o + +USB_IA_OBJS += usb_ia.o + +UWBA_OBJS += uwba.o uwbai.o + +SCSA2USB_OBJS += scsa2usb.o usb_ms_bulkonly.o usb_ms_cbi.o + +HWAHC_OBJS += hwahc.o hwahc_util.o + +WUSB_DF_OBJS += wusb_df.o +WUSB_FWMOD_OBJS += wusb_fwmod.o + +IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \ + ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \ + ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o + +IBD_OBJS += ibd.o ibd_cm.o + +EIBNX_OBJS += enx_main.o enx_hdlrs.o enx_ibt.o enx_log.o enx_fip.o \ + enx_misc.o enx_q.o enx_ctl.o + +EOIB_OBJS += eib_adm.o eib_chan.o eib_cmn.o eib_ctl.o eib_data.o \ + eib_fip.o eib_ibt.o eib_log.o eib_mac.o eib_main.o \ + eib_rsrc.o eib_svc.o eib_vnic.o + +DLPISTUB_OBJS += dlpistub.o + +SDP_OBJS += sdpddi.o + +TRILL_OBJS += trill.o + +CTF_OBJS += ctf_create.o ctf_decl.o ctf_error.o ctf_hash.o ctf_labels.o \ + ctf_lookup.o ctf_open.o ctf_types.o ctf_util.o ctf_subr.o ctf_mod.o + +SMBIOS_OBJS += smb_error.o smb_info.o smb_open.o smb_subr.o smb_dev.o + +RPCIB_OBJS += rpcib.o + +KMDB_OBJS += kdrv.o + +AFE_OBJS += afe.o + +BGE_OBJS += bge_main2.o bge_chip2.o bge_kstats.o bge_log.o bge_ndd.o \ + bge_atomic.o bge_mii.o bge_send.o bge_recv2.o bge_mii_5906.o + +DMFE_OBJS += dmfe_log.o dmfe_main.o dmfe_mii.o + +ELXL_OBJS += elxl.o + +HME_OBJS += hme.o + +IXGB_OBJS += ixgb.o ixgb_atomic.o ixgb_chip.o ixgb_gld.o ixgb_kstats.o \ + ixgb_log.o ixgb_ndd.o ixgb_rx.o ixgb_tx.o ixgb_xmii.o + +NGE_OBJS += nge_main.o nge_atomic.o nge_chip.o nge_ndd.o nge_kstats.o \ + nge_log.o nge_rx.o nge_tx.o nge_xmii.o + +RGE_OBJS += rge_main.o rge_chip.o rge_ndd.o rge_kstats.o rge_log.o rge_rxtx.o + +URTW_OBJS += urtw.o + +ARN_OBJS += arn_hw.o arn_eeprom.o arn_mac.o arn_calib.o arn_ani.o arn_phy.o arn_regd.o arn_beacon.o \ + arn_main.o arn_recv.o arn_xmit.o arn_rc.o + +ATH_OBJS += ath_aux.o ath_main.o ath_osdep.o ath_rate.o + +ATU_OBJS += atu.o + +IPW_OBJS += ipw2100_hw.o ipw2100.o + +IWI_OBJS += ipw2200_hw.o ipw2200.o + +IWH_OBJS += iwh.o + +IWK_OBJS += iwk2.o + +IWP_OBJS += iwp.o + +MWL_OBJS += mwl.o + +MWLFW_OBJS += mwlfw_mode.o + +WPI_OBJS += wpi.o + +RAL_OBJS += rt2560.o ral_rate.o + +RUM_OBJS += rum.o + +RWD_OBJS += rt2661.o + +RWN_OBJS += rt2860.o + +UATH_OBJS += uath.o + +UATHFW_OBJS += uathfw_mod.o + +URAL_OBJS += ural.o + +RTW_OBJS += rtw.o smc93cx6.o rtwphy.o rtwphyio.o + +ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o + +MXFE_OBJS += mxfe.o + 
+MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o mptsas_raid.o mptsas_smhba.o + +SFE_OBJS += sfe.o sfe_util.o + +BFE_OBJS += bfe.o + +BRIDGE_OBJS += bridge.o + +IDM_SHARED_OBJS += base64.o + +IDM_OBJS += $(IDM_SHARED_OBJS) \ + idm.o idm_impl.o idm_text.o idm_conn_sm.o idm_so.o + +VR_OBJS += vr.o + +ATGE_OBJS += atge_main.o atge_l1e.o atge_mii.o atge_l1.o + +YGE_OBJS = yge.o + +# +# Build up defines and paths. +# +LINT_DEFS += -Dunix + +# +# This duality can be removed when the native and target compilers +# are the same (or at least recognize the same command line syntax!) +# It is a bug in the current compilation system that the assember +# can't process the -Y I, flag. +# +NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common +INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common + +PCIEB_OBJS += pcieb.o + +# Chelsio N110 10G NIC driver module +# +CH_OBJS = ch.o glue.o pe.o sge.o + +CH_COM_OBJS = ch_mac.o ch_subr.o cspi.o espi.o ixf1010.o mc3.o mc4.o mc5.o \ + mv88e1xxx.o mv88x201x.o my3126.o pm3393.o tp.o ulp.o \ + vsc7321.o vsc7326.o xpak.o + +# +# PCI strings file +# +PCI_STRING_OBJS = pci_strings.o + +NET_DACF_OBJS += net_dacf.o + +# +# Xframe 10G NIC driver module +# +XGE_OBJS = xge.o xgell.o + +XGE_HAL_OBJS = xgehal-channel.o xgehal-fifo.o xgehal-ring.o xgehal-config.o \ + xgehal-driver.o xgehal-mm.o xgehal-stats.o xgehal-device.o \ + xge-queue.o xgehal-mgmt.o xgehal-mgmtaux.o + +# +# e1000g module +# +E1000G_OBJS += e1000_80003es2lan.o e1000_82540.o e1000_82541.o e1000_82542.o \ + e1000_82543.o e1000_82571.o e1000_api.o e1000_ich8lan.o \ + e1000_mac.o e1000_manage.o e1000_nvm.o e1000_osdep.o \ + e1000_phy.o e1000g_debug.o e1000g_main.o e1000g_alloc.o \ + e1000g_tx.o e1000g_rx.o e1000g_stat.o + +# +# Intel 82575 1G NIC driver module +# +IGB_OBJS = igb_82575.o igb_api.o igb_mac.o igb_manage.o \ + igb_nvm.o igb_osdep.o igb_phy.o igb_buf.o \ + igb_debug.o igb_gld.o igb_log.o igb_main.o \ + igb_rx.o igb_stat.o igb_tx.o + +# +# Intel 10GbE PCIE NIC driver module +# +IXGBE_OBJS = ixgbe_82598.o ixgbe_82599.o ixgbe_api.o \ + ixgbe_common.o ixgbe_phy.o \ + ixgbe_buf.o ixgbe_debug.o ixgbe_gld.o \ + ixgbe_log.o ixgbe_main.o \ + ixgbe_osdep.o ixgbe_rx.o ixgbe_stat.o \ + ixgbe_tx.o + +# +# NIU 10G/1G driver module +# +NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \ + nxge_txdma.o nxge_txc.o nxge_main.o \ + nxge_hw.o nxge_fzc.o nxge_virtual.o \ + nxge_send.o nxge_classify.o nxge_fflp.o \ + nxge_fflp_hash.o nxge_ndd.o nxge_kstats.o \ + nxge_zcp.o nxge_fm.o nxge_espc.o nxge_hv.o \ + nxge_hio.o nxge_hio_guest.o nxge_intr.o + +NXGE_NPI_OBJS = \ + npi.o npi_mac.o npi_ipp.o \ + npi_txdma.o npi_rxdma.o npi_txc.o \ + npi_zcp.o npi_espc.o npi_fflp.o \ + npi_vir.o + +NXGE_HCALL_OBJS = \ + nxge_hcall.o + +# +# kiconv modules +# +KICONV_EMEA_OBJS += kiconv_emea.o + +# +# blk2scsa +# +BLK2SCSA_OBJS = blk2scsa.o + +KICONV_JA_OBJS += kiconv_ja.o + +KICONV_KO_OBJS += kiconv_cck_common.o kiconv_ko.o + +KICONV_SC_OBJS += kiconv_cck_common.o kiconv_sc.o + +KICONV_TC_OBJS += kiconv_cck_common.o kiconv_tc.o + +# +# AAC module +# +AAC_OBJS = aac.o aac_ioctl.o + +# +# sdcard modules +# +SDA_OBJS = sda_cmd.o sda_host.o sda_init.o sda_mem.o sda_mod.o sda_slot.o +SDHOST_OBJS = sdhost.o + +# +# hxge 10G driver module +# +HXGE_OBJS = hxge_main.o hxge_vmac.o hxge_send.o \ + hxge_txdma.o hxge_rxdma.o hxge_virtual.o \ + hxge_fm.o hxge_fzc.o hxge_hw.o hxge_kstats.o \ + hxge_ndd.o hxge_pfc.o \ + hpi.o hpi_vmac.o hpi_rxdma.o hpi_txdma.o \ + hpi_vir.o hpi_pfc.o + +# +# 
MEGARAID_SAS module +# +MEGA_SAS_OBJS = megaraid_sas.o + +# +# MR_SAS module +# +MR_SAS_OBJS = mr_sas.o + +# +# ISCSI_INITIATOR module +# +ISCSI_INITIATOR_OBJS = chap.o iscsi_io.o iscsi_thread.o \ + iscsi_ioctl.o iscsid.o iscsi.o \ + iscsi_login.o isns_client.o iscsiAuthClient.o \ + iscsi_lun.o iscsiAuthClientGlue.o \ + iscsi_net.o nvfile.o iscsi_cmd.o \ + iscsi_queue.o persistent.o iscsi_conn.o \ + iscsi_sess.o radius_auth.o iscsi_crc.o \ + iscsi_stats.o radius_packet.o iscsi_doorclt.o \ + iscsi_targetparam.o utils.o kifconf.o + +# +# ntxn 10Gb/1Gb NIC driver module +# +NTXN_OBJS = unm_nic_init.o unm_gem.o unm_nic_hw.o unm_ndd.o \ + unm_nic_main.o unm_nic_isr.o unm_nic_ctx.o niu.o + +# +# Myricom 10Gb NIC driver module +# +MYRI10GE_OBJS = myri10ge.o myri10ge_lro.o + +# nulldriver module +# +NULLDRIVER_OBJS = nulldriver.o + +TPM_OBJS = tpm.o tpm_hcall.o diff --git a/uts/common/dtrace/dtrace.c b/uts/common/dtrace/dtrace.c index c721386280f8..2a9df6d403f2 100644 --- a/uts/common/dtrace/dtrace.c +++ b/uts/common/dtrace/dtrace.c @@ -20,12 +20,9 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DTrace - Dynamic Tracing for Solaris * @@ -186,7 +183,9 @@ static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */ static dtrace_genid_t dtrace_probegen; /* current probe generation */ static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */ static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */ +static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */ static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */ +static int dtrace_dynvar_failclean; /* dynvars failed to clean */ /* * DTrace Locking @@ -240,10 +239,16 @@ static void dtrace_nullop(void) {} +static int +dtrace_enable_nullop(void) +{ + return (0); +} + static dtrace_pops_t dtrace_provider_ops = { (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, (void (*)(void *, struct modctl *))dtrace_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, @@ -427,6 +432,7 @@ dtrace_load##bits(uintptr_t addr) \ #define DTRACE_DYNHASH_SINK 1 #define DTRACE_DYNHASH_VALID 2 +#define DTRACE_MATCH_FAIL -1 #define DTRACE_MATCH_NEXT 0 #define DTRACE_MATCH_DONE 1 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') @@ -1182,12 +1188,12 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate) { dtrace_dynvar_t *dirty; dtrace_dstate_percpu_t *dcpu; - int i, work = 0; + dtrace_dynvar_t **rinsep; + int i, j, work = 0; for (i = 0; i < NCPU; i++) { dcpu = &dstate->dtds_percpu[i]; - - ASSERT(dcpu->dtdsc_rinsing == NULL); + rinsep = &dcpu->dtdsc_rinsing; /* * If the dirty list is NULL, there is no dirty work to do. @@ -1195,14 +1201,62 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate) if (dcpu->dtdsc_dirty == NULL) continue; - /* - * If the clean list is non-NULL, then we're not going to do - * any work for this CPU -- it means that there has not been - * a dtrace_dynvar() allocation on this CPU (or from this CPU) - * since the last time we cleaned house. 
- */ - if (dcpu->dtdsc_clean != NULL) + if (dcpu->dtdsc_rinsing != NULL) { + /* + * If the rinsing list is non-NULL, then it is because + * this CPU was selected to accept another CPU's + * dirty list -- and since that time, dirty buffers + * have accumulated. This is a highly unlikely + * condition, but we choose to ignore the dirty + * buffers -- they'll be picked up a future cleanse. + */ continue; + } + + if (dcpu->dtdsc_clean != NULL) { + /* + * If the clean list is non-NULL, then we're in a + * situation where a CPU has done deallocations (we + * have a non-NULL dirty list) but no allocations (we + * also have a non-NULL clean list). We can't simply + * move the dirty list into the clean list on this + * CPU, yet we also don't want to allow this condition + * to persist, lest a short clean list prevent a + * massive dirty list from being cleaned (which in + * turn could lead to otherwise avoidable dynamic + * drops). To deal with this, we look for some CPU + * with a NULL clean list, NULL dirty list, and NULL + * rinsing list -- and then we borrow this CPU to + * rinse our dirty list. + */ + for (j = 0; j < NCPU; j++) { + dtrace_dstate_percpu_t *rinser; + + rinser = &dstate->dtds_percpu[j]; + + if (rinser->dtdsc_rinsing != NULL) + continue; + + if (rinser->dtdsc_dirty != NULL) + continue; + + if (rinser->dtdsc_clean != NULL) + continue; + + rinsep = &rinser->dtdsc_rinsing; + break; + } + + if (j == NCPU) { + /* + * We were unable to find another CPU that + * could accept this dirty list -- we are + * therefore unable to clean it now. + */ + dtrace_dynvar_failclean++; + continue; + } + } work = 1; @@ -1219,7 +1273,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate) * on a hash chain, either the dirty list or the * rinsing list for some CPU must be non-NULL.) */ - dcpu->dtdsc_rinsing = dirty; + *rinsep = dirty; dtrace_membar_producer(); } while (dtrace_casptr(&dcpu->dtdsc_dirty, dirty, NULL) != dirty); @@ -1650,7 +1704,7 @@ retry: ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE); /* - * Now we'll move the clean list to the free list. + * Now we'll move the clean list to our free list. * It's impossible for this to fail: the only way * the free list can be updated is through this * code path, and only one CPU can own the clean list. @@ -1663,6 +1717,7 @@ retry: * owners of the clean lists out before resetting * the clean lists. 
*/ + dcpu = &dstate->dtds_percpu[me]; rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean); ASSERT(rval == NULL); goto retry; @@ -3600,7 +3655,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, int64_t index = (int64_t)tupregs[1].dttk_value; int64_t remaining = (int64_t)tupregs[2].dttk_value; size_t len = dtrace_strlen((char *)s, size); - int64_t i = 0; + int64_t i; if (!dtrace_canload(s, len + 1, mstate, vstate)) { regs[rd] = NULL; @@ -6655,7 +6710,7 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, { dtrace_probe_t template, *probe; dtrace_hash_t *hash = NULL; - int len, best = INT_MAX, nmatched = 0; + int len, rc, best = INT_MAX, nmatched = 0; dtrace_id_t i; ASSERT(MUTEX_HELD(&dtrace_lock)); @@ -6667,7 +6722,8 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, if (pkp->dtpk_id != DTRACE_IDNONE) { if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL && dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) { - (void) (*matched)(probe, arg); + if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); nmatched++; } return (nmatched); @@ -6714,8 +6770,12 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + if ((rc = (*matched)(probe, arg)) != + DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); break; + } } return (nmatched); @@ -6734,8 +6794,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); break; + } } return (nmatched); @@ -6955,7 +7018,7 @@ dtrace_unregister(dtrace_provider_id_t id) dtrace_probe_t *probe, *first = NULL; if (old->dtpv_pops.dtps_enable == - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) { + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) { /* * If DTrace itself is the provider, we're called with locks * already held. @@ -7101,7 +7164,7 @@ dtrace_invalidate(dtrace_provider_id_t id) dtrace_provider_t *pvp = (dtrace_provider_t *)id; ASSERT(pvp->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -7142,7 +7205,7 @@ dtrace_condense(dtrace_provider_id_t id) * Make sure this isn't the dtrace provider itself. */ ASSERT(prov->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -8103,7 +8166,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, break; default: - err += efunc(dp->dtdo_len - 1, "bad return size"); + err += efunc(dp->dtdo_len - 1, "bad return size\n"); } } @@ -9096,7 +9159,7 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) return (ecb); } -static void +static int dtrace_ecb_enable(dtrace_ecb_t *ecb) { dtrace_probe_t *probe = ecb->dte_probe; @@ -9109,7 +9172,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) /* * This is the NULL probe -- there's nothing to do. 
*/ - return; + return (0); } if (probe->dtpr_ecb == NULL) { @@ -9123,8 +9186,8 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) if (ecb->dte_predicate != NULL) probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid; - prov->dtpv_pops.dtps_enable(prov->dtpv_arg, - probe->dtpr_id, probe->dtpr_arg); + return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg)); } else { /* * This probe is already active. Swing the last pointer to @@ -9137,6 +9200,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) probe->dtpr_predcache = 0; dtrace_sync(); + return (0); } } @@ -9920,7 +9984,9 @@ dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg) if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL) return (DTRACE_MATCH_DONE); - dtrace_ecb_enable(ecb); + if (dtrace_ecb_enable(ecb) < 0) + return (DTRACE_MATCH_FAIL); + return (DTRACE_MATCH_NEXT); } @@ -10557,6 +10623,7 @@ dtrace_enabling_destroy(dtrace_enabling_t *enab) ASSERT(enab->dten_vstate->dtvs_state != NULL); ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0); enab->dten_vstate->dtvs_state->dts_nretained--; + dtrace_retained_gen++; } if (enab->dten_prev == NULL) { @@ -10599,6 +10666,7 @@ dtrace_enabling_retain(dtrace_enabling_t *enab) return (ENOSPC); state->dts_nretained++; + dtrace_retained_gen++; if (dtrace_retained == NULL) { dtrace_retained = enab; @@ -10713,7 +10781,7 @@ static int dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) { int i = 0; - int matched = 0; + int total_matched = 0, matched = 0; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(MUTEX_HELD(&dtrace_lock)); @@ -10724,7 +10792,14 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_current = ep; enab->dten_error = 0; - matched += dtrace_probe_enable(&ep->dted_probe, enab); + /* + * If a provider failed to enable a probe then get out and + * let the consumer know we failed. + */ + if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0) + return (EBUSY); + + total_matched += matched; if (enab->dten_error != 0) { /* @@ -10752,7 +10827,7 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_probegen = dtrace_probegen; if (nmatched != NULL) - *nmatched = matched; + *nmatched = total_matched; return (0); } @@ -10766,13 +10841,22 @@ dtrace_enabling_matchall(void) mutex_enter(&dtrace_lock); /* - * Because we can be called after dtrace_detach() has been called, we - * cannot assert that there are retained enablings. We can safely - * load from dtrace_retained, however: the taskq_destroy() at the - * end of dtrace_detach() will block pending our completion. + * Iterate over all retained enablings to see if any probes match + * against them. We only perform this operation on enablings for which + * we have sufficient permissions by virtue of being in the global zone + * or in the same zone as the DTrace client. Because we can be called + * after dtrace_detach() has been called, we cannot assert that there + * are retained enablings. We can safely load from dtrace_retained, + * however: the taskq_destroy() at the end of dtrace_detach() will + * block pending our completion. 
*/ - for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) - (void) dtrace_enabling_match(enab, NULL); + for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) { + cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred; + + if (INGLOBALZONE(curproc) || + cr != NULL && getzoneid() == crgetzoneid(cr)) + (void) dtrace_enabling_match(enab, NULL); + } mutex_exit(&dtrace_lock); mutex_exit(&cpu_lock); @@ -10830,6 +10914,7 @@ dtrace_enabling_provide(dtrace_provider_t *prv) { int i, all = 0; dtrace_probedesc_t desc; + dtrace_genid_t gen; ASSERT(MUTEX_HELD(&dtrace_lock)); ASSERT(MUTEX_HELD(&dtrace_provider_lock)); @@ -10840,15 +10925,25 @@ dtrace_enabling_provide(dtrace_provider_t *prv) } do { - dtrace_enabling_t *enab = dtrace_retained; + dtrace_enabling_t *enab; void *parg = prv->dtpv_arg; - for (; enab != NULL; enab = enab->dten_next) { +retry: + gen = dtrace_retained_gen; + for (enab = dtrace_retained; enab != NULL; + enab = enab->dten_next) { for (i = 0; i < enab->dten_ndesc; i++) { desc = enab->dten_desc[i]->dted_probe; mutex_exit(&dtrace_lock); prv->dtpv_pops.dtps_provide(parg, &desc); mutex_enter(&dtrace_lock); + /* + * Process the retained enablings again if + * they have changed while we weren't holding + * dtrace_lock. + */ + if (gen != dtrace_retained_gen) + goto retry; } } } while (all && (prv = prv->dtpv_next) != NULL); @@ -10970,7 +11065,8 @@ dtrace_dof_copyin(uintptr_t uarg, int *errp) dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP); - if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) { + if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { kmem_free(dof, hdr.dofh_loadsz); *errp = EFAULT; return (NULL); @@ -11698,6 +11794,13 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr, } } + if (DOF_SEC_ISLOADABLE(sec->dofs_type) && + !(sec->dofs_flags & DOF_SECF_LOAD)) { + dtrace_dof_error(dof, "loadable section with load " + "flag unset"); + return (-1); + } + if (!(sec->dofs_flags & DOF_SECF_LOAD)) continue; /* just ignore non-loadable sections */ @@ -14390,7 +14493,8 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) * If this wasn't an open with the "helper" minor, then it must be * the "dtrace" minor. */ - ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE); + if (getminor(*devp) != DTRACEMNRN_DTRACE) + return (ENXIO); /* * If no DTRACE_PRIV_* bits are set in the credential, then the @@ -14427,7 +14531,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) mutex_exit(&cpu_lock); if (state == NULL) { - if (--dtrace_opens == 0) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); mutex_exit(&dtrace_lock); return (EAGAIN); @@ -14463,7 +14567,12 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_destroy(state); ASSERT(dtrace_opens > 0); - if (--dtrace_opens == 0) + + /* + * Only relinquish control of the kernel debugger interface when there + * are no consumers and no anonymous enablings. 
+ */ + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); mutex_exit(&dtrace_lock); @@ -15458,7 +15567,8 @@ static struct dev_ops dtrace_ops = { nodev, /* reset */ &dtrace_cb_ops, /* driver operations */ NULL, /* bus operations */ - nodev /* dev power */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv modldrv = { diff --git a/uts/common/dtrace/fasttrap.c b/uts/common/dtrace/fasttrap.c index b7ca92f54a59..42263e4ef274 100644 --- a/uts/common/dtrace/fasttrap.c +++ b/uts/common/dtrace/fasttrap.c @@ -20,11 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" #include <sys/atomic.h> #include <sys/errno.h> @@ -876,7 +875,7 @@ fasttrap_disable_callbacks(void) } /*ARGSUSED*/ -static void +static int fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) { fasttrap_probe_t *probe = parg; @@ -904,7 +903,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * provider can't go away while we're in this code path. */ if (probe->ftp_prov->ftp_retired) - return; + return (0); /* * If we can't find the process, it may be that we're in the context of @@ -913,7 +912,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) */ if ((p = sprlock(probe->ftp_pid)) == NULL) { if ((curproc->p_flag & SFORKING) == 0) - return; + return (0); mutex_enter(&pidlock); p = prfind(probe->ftp_pid); @@ -975,7 +974,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * drop our reference on the trap table entry. */ fasttrap_disable_callbacks(); - return; + return (0); } } @@ -983,6 +982,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) sprunlock(p); probe->ftp_enabled = 1; + return (0); } /*ARGSUSED*/ @@ -1946,7 +1946,8 @@ fasttrap_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) probe = kmem_alloc(size, KM_SLEEP); - if (copyin(uprobe, probe, size) != 0) { + if (copyin(uprobe, probe, size) != 0 || + probe->ftps_noffs != noffs) { kmem_free(probe, size); return (EFAULT); } @@ -2044,13 +2045,6 @@ err: tp->ftt_proc->ftpc_acount != 0) break; - /* - * The count of active providers can only be - * decremented (i.e. to zero) during exec, exit, and - * removal of a meta provider so it should be - * impossible to drop the count during this operation(). - */ - ASSERT(tp->ftt_proc->ftpc_acount != 0); tp = tp->ftt_next; } @@ -2346,7 +2340,8 @@ static struct dev_ops fasttrap_ops = { nodev, /* reset */ &fasttrap_cb_ops, /* driver operations */ NULL, /* bus operations */ - nodev /* dev power */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ }; /* diff --git a/uts/common/dtrace/lockstat.c b/uts/common/dtrace/lockstat.c index 3eb76a061d32..69c8b7254486 100644 --- a/uts/common/dtrace/lockstat.c +++ b/uts/common/dtrace/lockstat.c @@ -19,11 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" #include <sys/types.h> #include <sys/param.h> @@ -84,7 +83,7 @@ static kmutex_t lockstat_test; /* for testing purposes only */ static dtrace_provider_id_t lockstat_id; /*ARGSUSED*/ -static void +static int lockstat_enable(void *arg, dtrace_id_t id, void *parg) { lockstat_probe_t *probe = parg; @@ -103,6 +102,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg) */ mutex_enter(&lockstat_test); mutex_exit(&lockstat_test); + return (0); } /*ARGSUSED*/ @@ -310,11 +310,13 @@ static struct dev_ops lockstat_ops = { nulldev, /* reset */ &lockstat_cb_ops, /* cb_ops */ NULL, /* bus_ops */ + NULL, /* power */ + ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv modldrv = { &mod_driverops, /* Type of module. This one is a driver */ - "Lock Statistics %I%", /* name of module */ + "Lock Statistics", /* name of module */ &lockstat_ops, /* driver ops */ }; diff --git a/uts/common/dtrace/profile.c b/uts/common/dtrace/profile.c index 8de919a851a2..c1a2d1f1c12f 100644 --- a/uts/common/dtrace/profile.c +++ b/uts/common/dtrace/profile.c @@ -19,11 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" #include <sys/errno.h> #include <sys/stat.h> @@ -361,7 +360,7 @@ profile_offline(void *arg, cpu_t *cpu, void *oarg) } /*ARGSUSED*/ -static void +static int profile_enable(void *arg, dtrace_id_t id, void *parg) { profile_probe_t *prof = parg; @@ -391,6 +390,7 @@ profile_enable(void *arg, dtrace_id_t id, void *parg) } else { prof->prof_cyclic = cyclic_add_omni(&omni); } + return (0); } /*ARGSUSED*/ @@ -539,7 +539,8 @@ static struct dev_ops profile_ops = { nodev, /* reset */ &profile_cb_ops, /* driver operations */ NULL, /* bus operations */ - nodev /* dev power */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ }; /* diff --git a/uts/common/dtrace/sdt_subr.c b/uts/common/dtrace/sdt_subr.c index 66ff8a92a01b..242185071bb2 100644 --- a/uts/common/dtrace/sdt_subr.c +++ b/uts/common/dtrace/sdt_subr.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/sdt_impl.h> static dtrace_pattr_t vtrace_attr = { @@ -43,6 +40,14 @@ static dtrace_pattr_t info_attr = { { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, }; +static dtrace_pattr_t fc_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +}; + static dtrace_pattr_t fpu_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, @@ -83,6 +88,14 @@ static dtrace_pattr_t xpv_attr = { { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM }, }; +static dtrace_pattr_t iscsi_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +}; + sdt_provider_t sdt_providers[] = { { "vtrace", "__vtrace_", &vtrace_attr, 0 }, { "sysinfo", "__cpu_sysinfo_", &info_attr, 0 }, @@ -91,11 +104,17 @@ sdt_provider_t sdt_providers[] = { { "sched", "__sched_", &stab_attr, 0 }, { "proc", "__proc_", &stab_attr, 0 }, { "io", "__io_", &stab_attr, 0 }, + { "ip", "__ip_", &stab_attr, 0 }, + { "tcp", "__tcp_", &stab_attr, 0 }, + { "udp", "__udp_", &stab_attr, 0 }, { "mib", "__mib_", &stab_attr, 0 }, { "fsinfo", "__fsinfo_", &fsinfo_attr, 0 }, + { "iscsi", "__iscsi_", &iscsi_attr, 0 }, { "nfsv3", "__nfsv3_", &stab_attr, 0 }, { "nfsv4", "__nfsv4_", &stab_attr, 0 }, { "xpv", "__xpv_", &xpv_attr, 0 }, + { "fc", "__fc_", &fc_attr, 0 }, + { "srp", "__srp_", &fc_attr, 0 }, { "sysevent", "__sysevent_", &stab_attr, 0 }, { "sdt", NULL, &sdt_attr, 0 }, { NULL } @@ -169,6 +188,73 @@ sdt_argdesc_t sdt_args[] = { { "fsinfo", NULL, 0, 0, "vnode_t *", "fileinfo_t *" }, { "fsinfo", NULL, 1, 1, "int", "int" }, + { "iscsi", "async-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "async-send", 1, 1, "iscsi_async_evt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "login-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "login-command", 1, 1, "iscsi_login_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "login-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "login-response", 1, 1, "iscsi_login_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "logout-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "logout-command", 1, 1, "iscsi_logout_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "logout-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "logout-response", 1, 1, "iscsi_logout_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-request", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-request", 1, 1, "iscsi_rtt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-send", 1, 1, "iscsi_data_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-receive", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-receive", 1, 1, "iscsi_data_hdr_t *", + 
"iscsiinfo_t *" }, + { "iscsi", "nop-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "nop-send", 1, 1, "iscsi_nop_in_hdr_t *", "iscsiinfo_t *" }, + { "iscsi", "nop-receive", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "nop-receive", 1, 1, "iscsi_nop_out_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "scsi-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "scsi-command", 1, 1, "iscsi_scsi_cmd_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "scsi-command", 2, 2, "scsi_task_t *", "scsicmd_t *" }, + { "iscsi", "scsi-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "scsi-response", 1, 1, "iscsi_scsi_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "task-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "task-command", 1, 1, "iscsi_scsi_task_mgt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "task-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "task-response", 1, 1, "iscsi_scsi_task_mgt_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "text-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "text-command", 1, 1, "iscsi_text_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "text-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "text-response", 1, 1, "iscsi_text_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "xfer-start", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "xfer-start", 1, 0, "idm_conn_t *", "iscsiinfo_t *" }, + { "iscsi", "xfer-start", 2, 1, "uintptr_t", "xferinfo_t *" }, + { "iscsi", "xfer-start", 3, 2, "uint32_t"}, + { "iscsi", "xfer-start", 4, 3, "uintptr_t"}, + { "iscsi", "xfer-start", 5, 4, "uint32_t"}, + { "iscsi", "xfer-start", 6, 5, "uint32_t"}, + { "iscsi", "xfer-start", 7, 6, "uint32_t"}, + { "iscsi", "xfer-start", 8, 7, "int"}, + { "iscsi", "xfer-done", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "xfer-done", 1, 0, "idm_conn_t *", "iscsiinfo_t *" }, + { "iscsi", "xfer-done", 2, 1, "uintptr_t", "xferinfo_t *" }, + { "iscsi", "xfer-done", 3, 2, "uint32_t"}, + { "iscsi", "xfer-done", 4, 3, "uintptr_t"}, + { "iscsi", "xfer-done", 5, 4, "uint32_t"}, + { "iscsi", "xfer-done", 6, 5, "uint32_t"}, + { "iscsi", "xfer-done", 7, 6, "uint32_t"}, + { "iscsi", "xfer-done", 8, 7, "int"}, + { "nfsv3", "op-getattr-start", 0, 0, "struct svc_req *", "conninfo_t *" }, { "nfsv3", "op-getattr-start", 1, 1, "nfsv3oparg_t *", @@ -788,6 +874,75 @@ sdt_argdesc_t sdt_args[] = { "nfsv4cbinfo_t *" }, { "nfsv4", "cb-recall-done", 2, 2, "CB_RECALL4res *" }, + { "ip", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "ip", "send", 1, 1, "conn_t *", "csinfo_t *" }, + { "ip", "send", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "ip", "send", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" }, + { "ip", "send", 4, 4, "ipha_t *", "ipv4info_t *" }, + { "ip", "send", 5, 5, "ip6_t *", "ipv6info_t *" }, + { "ip", "send", 6, 6, "int" }, /* used by __dtrace_ipsr_ill_t */ + { "ip", "receive", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "ip", "receive", 1, 1, "conn_t *", "csinfo_t *" }, + { "ip", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "ip", "receive", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" }, + { "ip", "receive", 4, 4, "ipha_t *", "ipv4info_t *" }, + { "ip", "receive", 5, 5, "ip6_t *", "ipv6info_t *" }, + { "ip", "receive", 6, 6, "int" }, /* used by __dtrace_ipsr_ill_t */ + + { "tcp", "connect-established", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "tcp", "connect-established", 1, 1, "ip_xmit_attr_t *", + "csinfo_t *" }, + { "tcp", "connect-established", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", 
"connect-established", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "connect-established", 4, 4, "tcph_t *", "tcpinfo_t *" }, + { "tcp", "connect-refused", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "tcp", "connect-refused", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "tcp", "connect-refused", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-refused", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "connect-refused", 4, 4, "tcph_t *", "tcpinfo_t *" }, + { "tcp", "connect-request", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "tcp", "connect-request", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "tcp", "connect-request", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-request", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "connect-request", 4, 4, "tcph_t *", "tcpinfo_t *" }, + { "tcp", "accept-established", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "tcp", "accept-established", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "tcp", "accept-established", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "accept-established", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "accept-established", 4, 4, "tcph_t *", "tcpinfo_t *" }, + { "tcp", "accept-refused", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "tcp", "accept-refused", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "tcp", "accept-refused", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "accept-refused", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "accept-refused", 4, 4, "tcph_t *", "tcpinfo_t *" }, + { "tcp", "state-change", 0, 0, "void", "void" }, + { "tcp", "state-change", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "tcp", "state-change", 2, 2, "void", "void" }, + { "tcp", "state-change", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "state-change", 4, 4, "void", "void" }, + { "tcp", "state-change", 5, 5, "int32_t", "tcplsinfo_t *" }, + { "tcp", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "tcp", "send", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "tcp", "send", 2, 2, "__dtrace_tcp_void_ip_t *", "ipinfo_t *" }, + { "tcp", "send", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "send", 4, 4, "__dtrace_tcp_tcph_t *", "tcpinfo_t *" }, + { "tcp", "receive", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "tcp", "receive", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "tcp", "receive", 2, 2, "__dtrace_tcp_void_ip_t *", "ipinfo_t *" }, + { "tcp", "receive", 3, 3, "tcp_t *", "tcpsinfo_t *" }, + { "tcp", "receive", 4, 4, "__dtrace_tcp_tcph_t *", "tcpinfo_t *" }, + + { "udp", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "udp", "send", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "udp", "send", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "udp", "send", 3, 3, "udp_t *", "udpsinfo_t *" }, + { "udp", "send", 4, 4, "udpha_t *", "udpinfo_t *" }, + { "udp", "receive", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "udp", "receive", 1, 1, "ip_xmit_attr_t *", "csinfo_t *" }, + { "udp", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "udp", "receive", 3, 3, "udp_t *", "udpsinfo_t *" }, + { "udp", "receive", 4, 4, "udpha_t *", "udpinfo_t *" }, + { "sysevent", "post", 0, 0, "evch_bind_t *", "syseventchaninfo_t *" }, { "sysevent", "post", 1, 1, "sysevent_impl_t *", "syseventinfo_t *" }, @@ -848,6 +1003,154 @@ sdt_argdesc_t sdt_args[] = { { "xpv", "setvcpucontext-end", 0, 0, "int" }, { "xpv", "setvcpucontext-start", 0, 0, "domid_t" }, { "xpv", "setvcpucontext-start", 1, 1, "vcpu_guest_context_t *" }, + + { "srp", "service-up", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "service-up", 1, 0, "srpt_session_t *", "srp_portinfo_t *" }, + { "srp", "service-down", 
0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "service-down", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-command", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "login-command", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-command", 2, 1, "srp_login_req_t *", + "srp_logininfo_t *" }, + { "srp", "login-response", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "login-response", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-response", 2, 1, "srp_login_rsp_t *", + "srp_logininfo_t *" }, + { "srp", "login-response", 3, 2, "srp_login_rej_t *" }, + { "srp", "logout-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "logout-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "task-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-command", 2, 1, "srp_cmd_req_t *", "srp_taskinfo_t *" }, + { "srp", "task-response", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "task-response", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" }, + { "srp", "task-response", 3, 2, "scsi_task_t *" }, + { "srp", "task-response", 4, 3, "int8_t" }, + { "srp", "scsi-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "scsi-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "scsi-command", 2, 1, "scsi_task_t *", "scsicmd_t *" }, + { "srp", "scsi-command", 3, 2, "srp_cmd_req_t *", "srp_taskinfo_t *" }, + { "srp", "scsi-response", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "scsi-response", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "scsi-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" }, + { "srp", "scsi-response", 3, 2, "scsi_task_t *" }, + { "srp", "scsi-response", 4, 3, "int8_t" }, + { "srp", "xfer-start", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "xfer-start", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "xfer-start", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" }, + { "srp", "xfer-start", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" }, + { "srp", "xfer-start", 4, 3, "ibt_send_wr_t *"}, + { "srp", "xfer-start", 5, 4, "uint32_t" }, + { "srp", "xfer-start", 6, 5, "uint32_t" }, + { "srp", "xfer-start", 7, 6, "uint32_t" }, + { "srp", "xfer-start", 8, 7, "uint32_t" }, + { "srp", "xfer-done", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "xfer-done", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "xfer-done", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" }, + { "srp", "xfer-done", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" }, + { "srp", "xfer-done", 4, 3, "ibt_send_wr_t *"}, + { "srp", "xfer-done", 5, 4, "uint32_t" }, + { "srp", "xfer-done", 6, 5, "uint32_t" }, + { "srp", "xfer-done", 7, 6, "uint32_t" }, + { "srp", "xfer-done", 8, 7, "uint32_t" }, + + { "fc", "link-up", 0, 0, "fct_i_local_port_t *", "conninfo_t *" }, + { "fc", "link-down", 0, 0, "fct_i_local_port_t *", "conninfo_t *" }, + { "fc", "fabric-login-start", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "fabric-login-start", 1, 0, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "fabric-login-end", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "fabric-login-end", 1, 0, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-login-start", 1, 1, "fct_local_port_t 
*", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 3, 3, "int", "int" }, + { "fc", "rport-login-end", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-login-end", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-end", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-end", 3, 3, "int", "int" }, + { "fc", "rport-login-end", 4, 4, "int", "int" }, + { "fc", "rport-logout-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-logout-start", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-start", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-start", 3, 3, "int", "int" }, + { "fc", "rport-logout-end", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-logout-end", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-end", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-end", 3, 3, "int", "int" }, + { "fc", "scsi-command", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "scsi-command", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-command", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "scsi-command", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-response", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "scsi-response", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-response", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "scsi-response", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "xfer-start", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "xfer-start", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 4, 4, "stmf_data_buf_t *", + "fc_xferinfo_t *" }, + { "fc", "xfer-done", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "xfer-done", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-done", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "xfer-done", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-done", 4, 4, "stmf_data_buf_t *", + "fc_xferinfo_t *" }, + { "fc", "rscn-receive", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "rscn-receive", 1, 1, "int", "int"}, + { "fc", "abts-receive", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "abts-receive", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + + { NULL } }; diff --git a/uts/common/dtrace/systrace.c b/uts/common/dtrace/systrace.c index be14660b04c0..b864041c450d 100644 --- a/uts/common/dtrace/systrace.c +++ b/uts/common/dtrace/systrace.c @@ -19,11 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" #include <sys/dtrace.h> #include <sys/systrace.h> @@ -141,7 +140,7 @@ systrace_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int systrace_enable(void *arg, dtrace_id_t id, void *parg) { int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); @@ -162,7 +161,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) if (enabled) { ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); - return; + return (0); } (void) casptr(&sysent[sysnum].sy_callc, @@ -173,6 +172,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) (void *)systrace_sysent32[sysnum].stsy_underlying, (void *)dtrace_systrace_syscall32); #endif + return (0); } /*ARGSUSED*/ @@ -336,7 +336,8 @@ static struct dev_ops systrace_ops = { nodev, /* reset */ &systrace_cb_ops, /* driver operations */ NULL, /* bus operations */ - nodev /* dev power */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ }; /* diff --git a/uts/common/fs/gfs.c b/uts/common/fs/gfs.c new file mode 100644 index 000000000000..4d24df60f75b --- /dev/null +++ b/uts/common/fs/gfs.c @@ -0,0 +1,1178 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Portions Copyright 2007 Shivakumar GN */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/kmem.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/sunddi.h> +#include <sys/uio.h> +#include <sys/vmsystm.h> +#include <sys/vfs.h> +#include <sys/vnode.h> + +#include <vm/as.h> +#include <vm/seg_vn.h> + +#include <sys/gfs.h> + +/* + * Generic pseudo-filesystem routines. + * + * There are significant similarities between the implementation of certain file + * system entry points across different filesystems. While one could attempt to + * "choke up on the bat" and incorporate common functionality into a VOP + * preamble or postamble, such an approach is limited in the benefit it can + * provide. In this file we instead define a toolkit of routines which can be + * called from a filesystem (with in-kernel pseudo-filesystems being the focus + * of the exercise) in a more component-like fashion. + * + * There are three basic classes of routines: + * + * 1) Lowlevel support routines + * + * These routines are designed to play a support role for existing + * pseudo-filesystems (such as procfs). They simplify common tasks, + * without forcing the filesystem to hand over management to GFS. 
The + * routines covered are: + * + * gfs_readdir_init() + * gfs_readdir_emit() + * gfs_readdir_emitn() + * gfs_readdir_pred() + * gfs_readdir_fini() + * gfs_lookup_dot() + * + * 2) Complete GFS management + * + * These routines take a more active role in management of the + * pseudo-filesystem. They handle the relationship between vnode private + * data and VFS data, as well as the relationship between vnodes in the + * directory hierarchy. + * + * In order to use these interfaces, the first member of every private + * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control + * to GFS. + * + * gfs_file_create() + * gfs_dir_create() + * gfs_root_create() + * + * gfs_file_inactive() + * gfs_dir_inactive() + * gfs_dir_lookup() + * gfs_dir_readdir() + * + * gfs_vop_inactive() + * gfs_vop_lookup() + * gfs_vop_readdir() + * gfs_vop_map() + * + * 3) Single File pseudo-filesystems + * + * This routine creates a rooted file to be overlayed ontop of another + * file in the physical filespace. + * + * Note that the parent is NULL (actually the vfs), but there is nothing + * technically keeping such a file from utilizing the "Complete GFS + * management" set of routines. + * + * gfs_root_create_file() + */ + +/* + * gfs_make_opsvec: take an array of vnode type definitions and create + * their vnodeops_t structures + * + * This routine takes an array of gfs_opsvec_t's. It could + * alternatively take an array of gfs_opsvec_t*'s, which would allow + * vnode types to be completely defined in files external to the caller + * of gfs_make_opsvec(). As it stands, much more sharing takes place -- + * both the caller and the vnode type provider need to access gfsv_ops + * and gfsv_template, and the caller also needs to know gfsv_name. + */ +int +gfs_make_opsvec(gfs_opsvec_t *vec) +{ + int error, i; + + for (i = 0; ; i++) { + if (vec[i].gfsv_name == NULL) + return (0); + error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template, + vec[i].gfsv_ops); + if (error) + break; + } + + cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'", + vec[i].gfsv_name); + for (i--; i >= 0; i--) { + vn_freevnodeops(*vec[i].gfsv_ops); + *vec[i].gfsv_ops = NULL; + } + return (error); +} + +/* + * Low level directory routines + * + * These routines provide some simple abstractions for reading directories. + * They are designed to be used by existing pseudo filesystems (namely procfs) + * that already have a complicated management infrastructure. + */ + +/* + * gfs_get_parent_ino: used to obtain a parent inode number and the + * inode number of the given vnode in preparation for calling gfs_readdir_init. 
+ */ +int +gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + ino64_t *pino, ino64_t *ino) +{ + vnode_t *parent; + gfs_dir_t *dp = dvp->v_data; + int error; + + *ino = dp->gfsd_file.gfs_ino; + parent = dp->gfsd_file.gfs_parent; + + if (parent == NULL) { + *pino = *ino; /* root of filesystem */ + } else if (dvp->v_flag & V_XATTRDIR) { + vattr_t va; + + va.va_mask = AT_NODEID; + error = VOP_GETATTR(parent, &va, 0, cr, ct); + if (error) + return (error); + *pino = va.va_nodeid; + } else { + *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; + } + + return (0); +} + +/* + * gfs_readdir_init: initiate a generic readdir + * st - a pointer to an uninitialized gfs_readdir_state_t structure + * name_max - the directory's maximum file name length + * ureclen - the exported file-space record length (1 for non-legacy FSs) + * uiop - the uiop passed to readdir + * parent - the parent directory's inode + * self - this directory's inode + * flags - flags from VOP_READDIR + * + * Returns 0 or a non-zero errno. + * + * Typical VOP_READDIR usage of gfs_readdir_*: + * + * if ((error = gfs_readdir_init(...)) != 0) + * return (error); + * eof = 0; + * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { + * if (!consumer_entry_at(voffset)) + * voffset = consumer_next_entry(voffset); + * if (consumer_eof(voffset)) { + * eof = 1 + * break; + * } + * if ((error = gfs_readdir_emit(..., voffset, + * consumer_ino(voffset), consumer_name(voffset))) != 0) + * break; + * } + * return (gfs_readdir_fini(..., error, eofp, eof)); + * + * As you can see, a zero result from gfs_readdir_pred() or + * gfs_readdir_emit() indicates that processing should continue, + * whereas a non-zero result indicates that the loop should terminate. + * Most consumers need do nothing more than let gfs_readdir_fini() + * determine what the cause of failure was and return the appropriate + * value. 
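As a concrete instance of the skeleton above, a hedged sketch of a readdir for a pseudo-directory whose entries are simply numbered 0..foo_nent-1; foo_node_t, foo_parent_ino(), FOO_INO() and the choice of MAXNAMELEN are illustrative assumptions, not taken from this file:

static int
foo_readdir(vnode_t *dvp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
    foo_node_t *np = dvp->v_data;    /* hypothetical private data */
    gfs_readdir_state_t gst;
    offset_t voff;
    int error, eof = 0;

    if ((error = gfs_readdir_init(&gst, MAXNAMELEN, 1, uiop,
        foo_parent_ino(np), FOO_INO(np), flags)) != 0)
        return (error);

    while ((error = gfs_readdir_pred(&gst, uiop, &voff)) == 0) {
        if (voff >= np->foo_nent) {
            eof = 1;
            break;
        }
        /* Entries are named "0", "1", ...; one inode per slot. */
        if ((error = gfs_readdir_emitn(&gst, uiop, voff,
            FOO_INO(np) + voff + 1, (unsigned long)voff)) != 0)
            break;
    }

    return (gfs_readdir_fini(&gst, error, eofp, eof));
}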
+ */ +int +gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, + uio_t *uiop, ino64_t parent, ino64_t self, int flags) +{ + size_t dirent_size; + + if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || + (uiop->uio_loffset % ureclen) != 0) + return (EINVAL); + + st->grd_ureclen = ureclen; + st->grd_oresid = uiop->uio_resid; + st->grd_namlen = name_max; + if (flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); + st->grd_parent = parent; + st->grd_self = self; + st->grd_flags = flags; + + return (0); +} + +/* + * gfs_readdir_emit_int: internal routine to emit directory entry + * + * st - the current readdir state, which must have d_ino/ed_ino + * and d_name/ed_name set + * uiop - caller-supplied uio pointer + * next - the offset of the next entry + */ +static int +gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next) +{ + int reclen; + dirent64_t *dp; + edirent_t *edp; + + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp = st->grd_dirent; + reclen = EDIRENT_RECLEN(strlen(edp->ed_name)); + } else { + dp = st->grd_dirent; + reclen = DIRENT64_RECLEN(strlen(dp->d_name)); + } + + if (reclen > uiop->uio_resid) { + /* + * Error if no entries were returned yet + */ + if (uiop->uio_resid == st->grd_oresid) + return (EINVAL); + return (-1); + } + + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp->ed_off = next; + edp->ed_reclen = (ushort_t)reclen; + } else { + dp->d_off = next; + dp->d_reclen = (ushort_t)reclen; + } + + if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) + return (EFAULT); + + uiop->uio_loffset = next; + + return (0); +} + +/* + * gfs_readdir_emit: emit a directory entry + * voff - the virtual offset (obtained from gfs_readdir_pred) + * ino - the entry's inode + * name - the entry's name + * eflags - value for ed_eflags (if processing edirent_t) + * + * Returns a 0 on success, a non-zero errno on failure, or -1 if the + * readdir loop should terminate. A non-zero result (either errno or + * -1) from this function is typically passed directly to + * gfs_readdir_fini(). + */ +int +gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, + ino64_t ino, const char *name, int eflags) +{ + offset_t off = (voff + 2) * st->grd_ureclen; + + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edirent_t *edp = st->grd_dirent; + + edp->ed_ino = ino; + (void) strncpy(edp->ed_name, name, st->grd_namlen); + edp->ed_eflags = eflags; + } else { + dirent64_t *dp = st->grd_dirent; + + dp->d_ino = ino; + (void) strncpy(dp->d_name, name, st->grd_namlen); + } + + /* + * Inter-entry offsets are invalid, so we assume a record size of + * grd_ureclen and explicitly set the offset appropriately. + */ + return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen)); +} + +/* + * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer + * instead of a string for the entry's name. + */ +int +gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, + ino64_t ino, unsigned long num) +{ + char buf[40]; + + numtos(num, buf); + return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0)); +} + +/* + * gfs_readdir_pred: readdir loop predicate + * voffp - a pointer in which the next virtual offset should be stored + * + * Returns a 0 on success, a non-zero errno on failure, or -1 if the + * readdir loop should terminate. 
A non-zero result (either errno or + * -1) from this function is typically passed directly to + * gfs_readdir_fini(). + */ +int +gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp) +{ + offset_t off, voff; + int error; + +top: + if (uiop->uio_resid <= 0) + return (-1); + + off = uiop->uio_loffset / st->grd_ureclen; + voff = off - 2; + if (off == 0) { + if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, + ".", 0)) == 0) + goto top; + } else if (off == 1) { + if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, + "..", 0)) == 0) + goto top; + } else { + *voffp = voff; + return (0); + } + + return (error); +} + +/* + * gfs_readdir_fini: generic readdir cleanup + * error - if positive, an error to return + * eofp - the eofp passed to readdir + * eof - the eof value + * + * Returns a 0 on success, a non-zero errno on failure. This result + * should be returned from readdir. + */ +int +gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) +{ + size_t dirent_size; + + if (st->grd_flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + kmem_free(st->grd_dirent, dirent_size); + if (error > 0) + return (error); + if (eofp) + *eofp = eof; + return (0); +} + +/* + * gfs_lookup_dot + * + * Performs a basic check for "." and ".." directory entries. + */ +int +gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) +{ + if (*nm == '\0' || strcmp(nm, ".") == 0) { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } else if (strcmp(nm, "..") == 0) { + if (pvp == NULL) { + ASSERT(dvp->v_flag & VROOT); + VN_HOLD(dvp); + *vpp = dvp; + } else { + VN_HOLD(pvp); + *vpp = pvp; + } + return (0); + } + + return (-1); +} + +/* + * gfs_file_create(): create a new GFS file + * + * size - size of private data structure (v_data) + * pvp - parent vnode (GFS directory) + * ops - vnode operations vector + * + * In order to use this interface, the parent vnode must have been created by + * gfs_dir_create(), and the private data stored in v_data must have a + * 'gfs_file_t' as its first field. + * + * Given these constraints, this routine will automatically: + * + * - Allocate v_data for the vnode + * - Initialize necessary fields in the vnode + * - Hold the parent + */ +vnode_t * +gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops) +{ + gfs_file_t *fp; + vnode_t *vp; + + /* + * Allocate vnode and internal data structure + */ + fp = kmem_zalloc(size, KM_SLEEP); + vp = vn_alloc(KM_SLEEP); + + /* + * Set up various pointers + */ + fp->gfs_vnode = vp; + fp->gfs_parent = pvp; + vp->v_data = fp; + fp->gfs_size = size; + fp->gfs_type = GFS_FILE; + + /* + * Initialize vnode and hold parent. + */ + vn_setops(vp, ops); + if (pvp) { + VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0); + VN_HOLD(pvp); + } + + return (vp); +} + +/* + * gfs_dir_create: creates a new directory in the parent + * + * size - size of private data structure (v_data) + * pvp - parent vnode (GFS directory) + * ops - vnode operations vector + * entries - NULL-terminated list of static entries (if any) + * maxlen - maximum length of a directory entry + * readdir_cb - readdir callback (see gfs_dir_readdir) + * inode_cb - inode callback (see gfs_dir_readdir) + * lookup_cb - lookup callback (see gfs_dir_lookup) + * + * In order to use this function, the first member of the private vnode + * structure (v_data) must be a gfs_dir_t. 
For each directory, there are + * static entries, defined when the structure is initialized, and dynamic + * entries, retrieved through callbacks. + * + * If a directory has static entries, then it must supply a inode callback, + * which will compute the inode number based on the parent and the index. + * For a directory with dynamic entries, the caller must supply a readdir + * callback and a lookup callback. If a static lookup fails, we fall back to + * the supplied lookup callback, if any. + * + * This function also performs the same initialization as gfs_file_create(). + */ +vnode_t * +gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops, + gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, + gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) +{ + vnode_t *vp; + gfs_dir_t *dp; + gfs_dirent_t *de; + + vp = gfs_file_create(struct_size, pvp, ops); + vp->v_type = VDIR; + + dp = vp->v_data; + dp->gfsd_file.gfs_type = GFS_DIR; + dp->gfsd_maxlen = maxlen; + + if (entries != NULL) { + for (de = entries; de->gfse_name != NULL; de++) + dp->gfsd_nstatic++; + + dp->gfsd_static = kmem_alloc( + dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); + bcopy(entries, dp->gfsd_static, + dp->gfsd_nstatic * sizeof (gfs_dirent_t)); + } + + dp->gfsd_readdir = readdir_cb; + dp->gfsd_lookup = lookup_cb; + dp->gfsd_inode = inode_cb; + + mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); + + return (vp); +} + +/* + * gfs_root_create(): create a root vnode for a GFS filesystem + * + * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The + * only difference is that it takes a vfs_t instead of a vnode_t as its parent. + */ +vnode_t * +gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, + gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, + gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) +{ + vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb, + maxlen, readdir_cb, lookup_cb); + + /* Manually set the inode */ + ((gfs_file_t *)vp->v_data)->gfs_ino = ino; + + VFS_HOLD(vfsp); + VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0); + vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; + + return (vp); +} + +/* + * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem + * + * Similar to gfs_root_create(), this creates a root vnode for a file to + * be the pseudo-filesystem. + */ +vnode_t * +gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino) +{ + vnode_t *vp = gfs_file_create(size, NULL, ops); + + ((gfs_file_t *)vp->v_data)->gfs_ino = ino; + + VFS_HOLD(vfsp); + VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0); + vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; + + return (vp); +} + +/* + * gfs_file_inactive() + * + * Called from the VOP_INACTIVE() routine. If necessary, this routine will + * remove the given vnode from the parent directory and clean up any references + * in the VFS layer. + * + * If the vnode was not removed (due to a race with vget), then NULL is + * returned. Otherwise, a pointer to the private data is returned. + */ +void * +gfs_file_inactive(vnode_t *vp) +{ + int i; + gfs_dirent_t *ge = NULL; + gfs_file_t *fp = vp->v_data; + gfs_dir_t *dp = NULL; + void *data; + + if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) + goto found; + + dp = fp->gfs_parent->v_data; + + /* + * First, see if this vnode is cached in the parent. + */ + gfs_dir_lock(dp); + + /* + * Find it in the set of static entries. 
+ */ + for (i = 0; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (ge->gfse_vnode == vp) + goto found; + } + + /* + * If 'ge' is NULL, then it is a dynamic entry. + */ + ge = NULL; + +found: + if (vp->v_flag & V_XATTRDIR) { + mutex_enter(&fp->gfs_parent->v_lock); + } + mutex_enter(&vp->v_lock); + if (vp->v_count == 1) { + /* + * Really remove this vnode + */ + data = vp->v_data; + if (ge != NULL) { + /* + * If this was a statically cached entry, simply set the + * cached vnode to NULL. + */ + ge->gfse_vnode = NULL; + } + if (vp->v_flag & V_XATTRDIR) { + fp->gfs_parent->v_xattrdir = NULL; + mutex_exit(&fp->gfs_parent->v_lock); + } + mutex_exit(&vp->v_lock); + + /* + * Free vnode and release parent + */ + if (fp->gfs_parent) { + if (dp) { + gfs_dir_unlock(dp); + } + VN_RELE(fp->gfs_parent); + } else { + ASSERT(vp->v_vfsp != NULL); + VFS_RELE(vp->v_vfsp); + } + vn_free(vp); + } else { + vp->v_count--; + data = NULL; + mutex_exit(&vp->v_lock); + if (vp->v_flag & V_XATTRDIR) { + mutex_exit(&fp->gfs_parent->v_lock); + } + if (dp) + gfs_dir_unlock(dp); + } + + return (data); +} + +/* + * gfs_dir_inactive() + * + * Same as above, but for directories. + */ +void * +gfs_dir_inactive(vnode_t *vp) +{ + gfs_dir_t *dp; + + ASSERT(vp->v_type == VDIR); + + if ((dp = gfs_file_inactive(vp)) != NULL) { + mutex_destroy(&dp->gfsd_lock); + if (dp->gfsd_nstatic) + kmem_free(dp->gfsd_static, + dp->gfsd_nstatic * sizeof (gfs_dirent_t)); + } + + return (dp); +} + +/* + * gfs_dir_lookup_dynamic() + * + * This routine looks up the provided name amongst the dynamic entries + * in the gfs directory and returns the corresponding vnode, if found. + * + * The gfs directory is expected to be locked by the caller prior to + * calling this function. The directory will be unlocked during the + * execution of this function, but will be locked upon return from the + * function. This function returns 0 on success, non-zero on error. + * + * The dynamic lookups are performed by invoking the lookup + * callback, which is passed to this function as the first argument. + * The arguments to the callback are: + * + * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, + * int flags, int *deflgs, pathname_t *rpnp); + * + * pvp - parent vnode + * nm - name of entry + * vpp - pointer to resulting vnode + * cr - pointer to cred + * flags - flags value from lookup request + * ignored here; currently only used to request + * insensitive lookups + * direntflgs - output parameter, directory entry flags + * ignored here; currently only used to indicate a lookup + * has more than one possible match when case is not considered + * realpnp - output parameter, real pathname + * ignored here; when lookup was performed case-insensitively, + * this field contains the "real" name of the file. + * + * Returns 0 on success, non-zero on error. + */ +static int +gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, + const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, + int *direntflags, pathname_t *realpnp) +{ + gfs_file_t *fp; + ino64_t ino; + int ret; + + ASSERT(GFS_DIR_LOCKED(dp)); + + /* + * Drop the directory lock, as the lookup routine + * will need to allocate memory, or otherwise deadlock on this + * directory. + */ + gfs_dir_unlock(dp); + ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); + gfs_dir_lock(dp); + + /* + * The callback for extended attributes returns a vnode + * with v_data from an underlying fs. 
+ */ + if (ret == 0 && !IS_XATTRDIR(dvp)) { + fp = (gfs_file_t *)((*vpp)->v_data); + fp->gfs_index = -1; + fp->gfs_ino = ino; + } + + return (ret); +} + +/* + * gfs_dir_lookup_static() + * + * This routine looks up the provided name amongst the static entries + * in the gfs directory and returns the corresponding vnode, if found. + * The first argument to the function is a pointer to the comparison + * function this function should use to decide if names are a match. + * + * If a match is found, and GFS_CACHE_VNODE is set and the vnode + * exists, we simply return the existing vnode. Otherwise, we call + * the static entry's callback routine, caching the result if + * necessary. If the idx pointer argument is non-NULL, we use it to + * return the index of the matching static entry. + * + * The gfs directory is expected to be locked by the caller prior to calling + * this function. The directory may be unlocked during the execution of + * this function, but will be locked upon return from the function. + * + * This function returns 0 if a match is found, ENOENT if not. + */ +static int +gfs_dir_lookup_static(int (*compare)(const char *, const char *), + gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, + vnode_t **vpp, pathname_t *rpnp) +{ + gfs_dirent_t *ge; + vnode_t *vp = NULL; + int i; + + ASSERT(GFS_DIR_LOCKED(dp)); + + /* + * Search static entries. + */ + for (i = 0; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (compare(ge->gfse_name, nm) == 0) { + if (rpnp) + (void) strlcpy(rpnp->pn_buf, ge->gfse_name, + rpnp->pn_bufsize); + + if (ge->gfse_vnode) { + ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); + vp = ge->gfse_vnode; + VN_HOLD(vp); + break; + } + + /* + * We drop the directory lock, as the constructor will + * need to do KM_SLEEP allocations. If we return from + * the constructor only to find that a parallel + * operation has completed, and GFS_CACHE_VNODE is set + * for this entry, we discard the result in favor of + * the cached vnode. + */ + gfs_dir_unlock(dp); + vp = ge->gfse_ctor(dvp); + gfs_dir_lock(dp); + + ((gfs_file_t *)vp->v_data)->gfs_index = i; + + /* Set the inode according to the callback. */ + ((gfs_file_t *)vp->v_data)->gfs_ino = + dp->gfsd_inode(dvp, i); + + if (ge->gfse_flags & GFS_CACHE_VNODE) { + if (ge->gfse_vnode == NULL) { + ge->gfse_vnode = vp; + } else { + /* + * A parallel constructor beat us to it; + * return existing vnode. We have to be + * careful because we can't release the + * current vnode while holding the + * directory lock; its inactive routine + * will try to lock this directory. + */ + vnode_t *oldvp = vp; + vp = ge->gfse_vnode; + VN_HOLD(vp); + + gfs_dir_unlock(dp); + VN_RELE(oldvp); + gfs_dir_lock(dp); + } + } + break; + } + } + + if (vp == NULL) + return (ENOENT); + else if (idx) + *idx = i; + *vpp = vp; + return (0); +} + +/* + * gfs_dir_lookup() + * + * Looks up the given name in the directory and returns the corresponding + * vnode, if found. + * + * First, we search statically defined entries, if any, with a call to + * gfs_dir_lookup_static(). If no static entry is found, and we have + * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). + * + * This function returns 0 on success, non-zero on error. 
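gfs_dir_lookup() below falls back to the directory's gfsd_lookup callback when no static entry matches. A hedged sketch of such a callback, written to match the way gfs_dir_lookup_dynamic() invokes it above (note the ino64_t * argument, which the framework stores into the new vnode's gfs_ino); foo_obj_t, foo_find() and foo_child_vnode() are hypothetical names:

static int
foo_lookup_cb(vnode_t *dvp, const char *nm, vnode_t **vpp, ino64_t *inop,
    cred_t *cr, int flags, int *direntflags, pathname_t *realpnp)
{
    foo_obj_t *obj;

    /* ENOENT lets gfs_dir_lookup() report a plain lookup failure. */
    if ((obj = foo_find(dvp, nm)) == NULL)
        return (ENOENT);

    *vpp = foo_child_vnode(dvp, obj);    /* held vnode for the entry */
    *inop = obj->foo_ino;                /* becomes gfs_ino of *vpp */
    return (0);
}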
+ */ +int +gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, + int flags, int *direntflags, pathname_t *realpnp) +{ + gfs_dir_t *dp = dvp->v_data; + boolean_t casecheck; + vnode_t *dynvp = NULL; + vnode_t *vp = NULL; + int (*compare)(const char *, const char *); + int error, idx; + + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) + return (0); + + casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; + if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || + (flags & FIGNORECASE)) + compare = strcasecmp; + else + compare = strcmp; + + gfs_dir_lock(dp); + + error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); + + if (vp && casecheck) { + gfs_dirent_t *ge; + int i; + + for (i = idx + 1; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (strcasecmp(ge->gfse_name, nm) == 0) { + *direntflags |= ED_CASE_CONFLICT; + goto out; + } + } + } + + if ((error || casecheck) && dp->gfsd_lookup) + error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, + &dynvp, cr, flags, direntflags, vp ? NULL : realpnp); + + if (vp && dynvp) { + /* static and dynamic entries are case-insensitive conflict */ + ASSERT(casecheck); + *direntflags |= ED_CASE_CONFLICT; + VN_RELE(dynvp); + } else if (vp == NULL) { + vp = dynvp; + } else if (error == ENOENT) { + error = 0; + } else if (error) { + VN_RELE(vp); + vp = NULL; + } + +out: + gfs_dir_unlock(dp); + + *vpp = vp; + return (error); +} + +/* + * gfs_dir_readdir: does a readdir() on the given directory + * + * dvp - directory vnode + * uiop - uio structure + * eofp - eof pointer + * data - arbitrary data passed to readdir callback + * + * This routine does all the readdir() dirty work. Even so, the caller must + * supply two callbacks in order to get full compatibility. + * + * If the directory contains static entries, an inode callback must be + * specified. This avoids having to create every vnode and call VOP_GETATTR() + * when reading the directory. This function has the following arguments: + * + * ino_t gfs_inode_cb(vnode_t *vp, int index); + * + * vp - vnode for the directory + * index - index in original gfs_dirent_t array + * + * Returns the inode number for the given entry. + * + * For directories with dynamic entries, a readdir callback must be provided. + * This is significantly more complex, thanks to the particulars of + * VOP_READDIR(). + * + * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, + * offset_t *off, offset_t *nextoff, void *data, int flags) + * + * vp - directory vnode + * dp - directory entry, sized according to maxlen given to + * gfs_dir_create(). callback must fill in d_name and + * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags + * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS + * is set in 'flags'. + * eofp - callback must set to 1 when EOF has been reached + * off - on entry, the last offset read from the directory. Callback + * must set to the offset of the current entry, typically left + * untouched. + * nextoff - callback must set to offset of next entry. Typically + * (off + 1) + * data - caller-supplied data + * flags - VOP_READDIR flags + * + * Return 0 on success, or error on failure. 
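For the dynamic half of gfs_dir_readdir() below, a hedged sketch of a readdir callback honoring the contract just described; it assumes the directory was created with a maxlen of MAXNAMELEN and that V_RDDIR_ENTFLAGS is not in use, and foo_dir_t, foo_nent and FOO_DYN_INO() are hypothetical:

static int
foo_readdir_cb(vnode_t *dvp, void *dp, int *eofp, offset_t *offp,
    offset_t *nextp, void *data, int flags)
{
    foo_dir_t *fd = dvp->v_data;
    dirent64_t *dep = dp;    /* assumes V_RDDIR_ENTFLAGS is clear */

    if (*offp >= fd->foo_nent) {
        *eofp = 1;           /* no more dynamic entries */
        return (0);
    }

    /* Fill in name and inode; the caller fixes up offsets and reclen. */
    (void) snprintf(dep->d_name, MAXNAMELEN, "%llu",
        (unsigned long long)*offp);
    dep->d_ino = FOO_DYN_INO(fd, *offp);
    *nextp = *offp + 1;

    return (0);
}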
+ */ +int +gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr, + caller_context_t *ct, int flags) +{ + gfs_readdir_state_t gstate; + int error, eof = 0; + ino64_t ino, pino; + offset_t off, next; + gfs_dir_t *dp = dvp->v_data; + + error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino); + if (error) + return (error); + + if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, + pino, ino, flags)) != 0) + return (error); + + while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 && + !eof) { + + if (off >= 0 && off < dp->gfsd_nstatic) { + ino = dp->gfsd_inode(dvp, off); + + if ((error = gfs_readdir_emit(&gstate, uiop, + off, ino, dp->gfsd_static[off].gfse_name, 0)) + != 0) + break; + + } else if (dp->gfsd_readdir) { + off -= dp->gfsd_nstatic; + + if ((error = dp->gfsd_readdir(dvp, + gstate.grd_dirent, &eof, &off, &next, + data, flags)) != 0 || eof) + break; + + off += dp->gfsd_nstatic + 2; + next += dp->gfsd_nstatic + 2; + + if ((error = gfs_readdir_emit_int(&gstate, uiop, + next)) != 0) + break; + } else { + /* + * Offset is beyond the end of the static entries, and + * we have no dynamic entries. Set EOF. + */ + eof = 1; + } + } + + return (gfs_readdir_fini(&gstate, error, eofp, eof)); +} + + +/* + * gfs_vop_lookup: VOP_LOOKUP() entry point + * + * For use directly in vnode ops table. Given a GFS directory, calls + * gfs_dir_lookup() as necessary. + */ +/* ARGSUSED */ +int +gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); +} + +/* + * gfs_vop_readdir: VOP_READDIR() entry point + * + * For use directly in vnode ops table. Given a GFS directory, calls + * gfs_dir_readdir() as necessary. + */ +/* ARGSUSED */ +int +gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags)); +} + + +/* + * gfs_vop_map: VOP_MAP() entry point + * + * Convenient routine for handling pseudo-files that wish to allow mmap() calls. + * This function only works for readonly files, and uses the read function for + * the vnode to fill in the data. The mapped data is immediately faulted in and + * filled with the necessary data during this call; there are no getpage() or + * putpage() routines. + */ +/* ARGSUSED */ +int +gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred, + caller_context_t *ct) +{ + int rv; + ssize_t resid = len; + + /* + * Check for bad parameters + */ +#ifdef _ILP32 + if (len > MAXOFF_T) + return (ENOMEM); +#endif + if (vp->v_flag & VNOMAP) + return (ENOTSUP); + if (off > MAXOFF_T) + return (EFBIG); + if ((long)off < 0 || (long)(off + len) < 0) + return (EINVAL); + if (vp->v_type != VREG) + return (ENODEV); + if ((prot & (PROT_EXEC | PROT_WRITE)) != 0) + return (EACCES); + + /* + * Find appropriate address if needed, otherwise clear address range. 
+ */ + as_rangelock(as); + rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (rv != 0) { + as_rangeunlock(as); + return (rv); + } + + /* + * Create mapping + */ + rv = as_map(as, *addrp, len, segvn_create, zfod_argsp); + as_rangeunlock(as); + if (rv != 0) + return (rv); + + /* + * Fill with data from read() + */ + rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE, + 0, (rlim64_t)0, cred, &resid); + + if (rv == 0 && resid != 0) + rv = ENXIO; + + if (rv != 0) { + as_rangelock(as); + (void) as_unmap(as, *addrp, len); + as_rangeunlock(as); + } + + return (rv); +} + +/* + * gfs_vop_inactive: VOP_INACTIVE() entry point + * + * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or + * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. + */ +/* ARGSUSED */ +void +gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + gfs_file_t *fp = vp->v_data; + void *data; + + if (fp->gfs_type == GFS_DIR) + data = gfs_dir_inactive(vp); + else + data = gfs_file_inactive(vp); + + if (data != NULL) + kmem_free(data, fp->gfs_size); +} diff --git a/uts/common/fs/vnode.c b/uts/common/fs/vnode.c new file mode 100644 index 000000000000..382369c7fc72 --- /dev/null +++ b/uts/common/fs/vnode.c @@ -0,0 +1,4536 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/uio.h> +#include <sys/file.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/rwstlock.h> +#include <sys/fem.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/conf.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/systm.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <c2/audit.h> +#include <sys/acl.h> +#include <sys/nbmlock.h> +#include <sys/fcntl.h> +#include <fs/fs_subr.h> +#include <sys/taskq.h> +#include <fs/fs_reparse.h> + +/* Determine if this vnode is a file that is read-only */ +#define ISROFILE(vp) \ + ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \ + (vp)->v_type != VFIFO && vn_is_readonly(vp)) + +/* Tunable via /etc/system; used only by admin/install */ +int nfs_global_client_only; + +/* + * Array of vopstats_t for per-FS-type vopstats. This array has the same + * number of entries as and parallel to the vfssw table. (Arguably, it could + * be part of the vfssw table.) Once it's initialized, it's accessed using + * the same fstype index that is used to index into the vfssw table. + */ +vopstats_t **vopstats_fstype; + +/* vopstats initialization template used for fast initialization via bcopy() */ +static vopstats_t *vs_templatep; + +/* Kmem cache handle for vsk_anchor_t allocations */ +kmem_cache_t *vsk_anchor_cache; + +/* file events cleanup routine */ +extern void free_fopdata(vnode_t *); + +/* + * Root of AVL tree for the kstats associated with vopstats. Lock protects + * updates to vsktat_tree. + */ +avl_tree_t vskstat_tree; +kmutex_t vskstat_tree_lock; + +/* Global variable which enables/disables the vopstats collection */ +int vopstats_enabled = 1; + +/* + * forward declarations for internal vnode specific data (vsd) + */ +static void *vsd_realloc(void *, size_t, size_t); + +/* + * forward declarations for reparse point functions + */ +static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr); + +/* + * VSD -- VNODE SPECIFIC DATA + * The v_data pointer is typically used by a file system to store a + * pointer to the file system's private node (e.g. ufs inode, nfs rnode). + * However, there are times when additional project private data needs + * to be stored separately from the data (node) pointed to by v_data. + * This additional data could be stored by the file system itself or + * by a completely different kernel entity. VSD provides a way for + * callers to obtain a key and store a pointer to private data associated + * with a vnode. + * + * Callers are responsible for protecting the vsd by holding v_vsd_lock + * for calls to vsd_set() and vsd_get(). + */ + +/* + * vsd_lock protects: + * vsd_nkeys - creation and deletion of vsd keys + * vsd_list - insertion and deletion of vsd_node in the vsd_list + * vsd_destructor - adding and removing destructors to the list + */ +static kmutex_t vsd_lock; +static uint_t vsd_nkeys; /* size of destructor array */ +/* list of vsd_node's */ +static list_t *vsd_list = NULL; +/* per-key destructor funcs */ +static void (**vsd_destructor)(void *); + +/* + * The following is the common set of actions needed to update the + * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and + * VOPSTATS_UPDATE_IO() do almost the same thing, except for the + * recording of the bytes transferred. 
Since the code is similar + * but small, it is nearly a duplicate. Consequently any changes + * to one may need to be reflected in the other. + * Rundown of the variables: + * vp - Pointer to the vnode + * counter - Partial name structure member to update in vopstats for counts + * bytecounter - Partial name structure member to update in vopstats for bytes + * bytesval - Value to update in vopstats for bytes + * fstype - Index into vsanchor_fstype[], same as index into vfssw[] + * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i]) + */ + +#define VOPSTATS_UPDATE(vp, counter) { \ + vfs_t *vfsp = (vp)->v_vfsp; \ + if (vfsp && vfsp->vfs_implp && \ + (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \ + vopstats_t *vsp = &vfsp->vfs_vopstats; \ + uint64_t *stataddr = &(vsp->n##counter.value.ui64); \ + extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \ + size_t, uint64_t *); \ + __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \ + (*stataddr)++; \ + if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \ + vsp->n##counter.value.ui64++; \ + } \ + } \ +} + +#define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \ + vfs_t *vfsp = (vp)->v_vfsp; \ + if (vfsp && vfsp->vfs_implp && \ + (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \ + vopstats_t *vsp = &vfsp->vfs_vopstats; \ + uint64_t *stataddr = &(vsp->n##counter.value.ui64); \ + extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \ + size_t, uint64_t *); \ + __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \ + (*stataddr)++; \ + vsp->bytecounter.value.ui64 += bytesval; \ + if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \ + vsp->n##counter.value.ui64++; \ + vsp->bytecounter.value.ui64 += bytesval; \ + } \ + } \ +} + +/* + * If the filesystem does not support XIDs map credential + * If the vfsp is NULL, perhaps we should also map? + */ +#define VOPXID_MAP_CR(vp, cr) { \ + vfs_t *vfsp = (vp)->v_vfsp; \ + if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \ + cr = crgetmapped(cr); \ + } + +/* + * Convert stat(2) formats to vnode types and vice versa. (Knows about + * numerical order of S_IFMT and vnode types.) + */ +enum vtype iftovt_tab[] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON +}; + +ushort_t vttoif_tab[] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, + S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0 +}; + +/* + * The system vnode cache. + */ + +kmem_cache_t *vn_cache; + + +/* + * Vnode operations vector. 
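/*
 * For reference, a call such as VOPSTATS_UPDATE_IO(vp, read, read_bytes, n)
 * using the macro above reduces to roughly the following (sketch; the
 * embedded DTrace fsinfo probe is omitted):
 */
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp != NULL && vfsp->vfs_implp != NULL &&
	    (vfsp->vfs_flag & VFS_STATS) && vp->v_type != VBAD) {
		vopstats_t *vsp = &vfsp->vfs_vopstats;

		vsp->nread.value.ui64++;		/* per-mount count */
		vsp->read_bytes.value.ui64 += n;	/* per-mount bytes */
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {
			vsp->nread.value.ui64++;	/* per-fstype count */
			vsp->read_bytes.value.ui64 += n;
		}
	}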
+ */ + +static const fs_operation_trans_def_t vn_ops_table[] = { + VOPNAME_OPEN, offsetof(struct vnodeops, vop_open), + fs_nosys, fs_nosys, + + VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close), + fs_nosys, fs_nosys, + + VOPNAME_READ, offsetof(struct vnodeops, vop_read), + fs_nosys, fs_nosys, + + VOPNAME_WRITE, offsetof(struct vnodeops, vop_write), + fs_nosys, fs_nosys, + + VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl), + fs_nosys, fs_nosys, + + VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl), + fs_setfl, fs_nosys, + + VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr), + fs_nosys, fs_nosys, + + VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr), + fs_nosys, fs_nosys, + + VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access), + fs_nosys, fs_nosys, + + VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup), + fs_nosys, fs_nosys, + + VOPNAME_CREATE, offsetof(struct vnodeops, vop_create), + fs_nosys, fs_nosys, + + VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove), + fs_nosys, fs_nosys, + + VOPNAME_LINK, offsetof(struct vnodeops, vop_link), + fs_nosys, fs_nosys, + + VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename), + fs_nosys, fs_nosys, + + VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir), + fs_nosys, fs_nosys, + + VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir), + fs_nosys, fs_nosys, + + VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir), + fs_nosys, fs_nosys, + + VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink), + fs_nosys, fs_nosys, + + VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink), + fs_nosys, fs_nosys, + + VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync), + fs_nosys, fs_nosys, + + VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive), + fs_nosys, fs_nosys, + + VOPNAME_FID, offsetof(struct vnodeops, vop_fid), + fs_nosys, fs_nosys, + + VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock), + fs_rwlock, fs_rwlock, + + VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock), + (fs_generic_func_p) fs_rwunlock, + (fs_generic_func_p) fs_rwunlock, /* no errors allowed */ + + VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek), + fs_nosys, fs_nosys, + + VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp), + fs_cmp, fs_cmp, /* no errors allowed */ + + VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock), + fs_frlock, fs_nosys, + + VOPNAME_SPACE, offsetof(struct vnodeops, vop_space), + fs_nosys, fs_nosys, + + VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp), + fs_nosys, fs_nosys, + + VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage), + fs_nosys, fs_nosys, + + VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage), + fs_nosys, fs_nosys, + + VOPNAME_MAP, offsetof(struct vnodeops, vop_map), + (fs_generic_func_p) fs_nosys_map, + (fs_generic_func_p) fs_nosys_map, + + VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap), + (fs_generic_func_p) fs_nosys_addmap, + (fs_generic_func_p) fs_nosys_addmap, + + VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap), + fs_nosys, fs_nosys, + + VOPNAME_POLL, offsetof(struct vnodeops, vop_poll), + (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll, + + VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump), + fs_nosys, fs_nosys, + + VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf), + fs_pathconf, fs_nosys, + + VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio), + fs_nosys, fs_nosys, + + VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl), + fs_nosys, fs_nosys, + + VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose), + (fs_generic_func_p) fs_dispose, + 
(fs_generic_func_p) fs_nodispose, + + VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr), + fs_nosys, fs_nosys, + + VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr), + fs_fab_acl, fs_nosys, + + VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock), + fs_shrlock, fs_nosys, + + VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent), + (fs_generic_func_p) fs_vnevent_nosupport, + (fs_generic_func_p) fs_vnevent_nosupport, + + VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf), + fs_nosys, fs_nosys, + + VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf), + fs_nosys, fs_nosys, + + NULL, 0, NULL, NULL +}; + +/* Extensible attribute (xva) routines. */ + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = AT_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. + */ +xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} + +/* + * Used by the AVL routines to compare two vsk_anchor_t structures in the tree. + * We use the f_fsid reported by VFS_STATVFS() since we use that for the + * kstat name. + */ +static int +vska_compar(const void *n1, const void *n2) +{ + int ret; + ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid; + ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid; + + if (p1 < p2) { + ret = -1; + } else if (p1 > p2) { + ret = 1; + } else { + ret = 0; + } + + return (ret); +} + +/* + * Used to create a single template which will be bcopy()ed to a newly + * allocated vsanchor_combo_t structure in new_vsanchor(), below. 
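/*
 * A typical consumer pattern for the xva routines above, assuming the
 * standard XVA_SET_REQ()/XVA_ISSET_RTN() macros and the XAT_READONLY
 * optional attribute from sys/vnode.h (sketch):
 */
	xvattr_t xva;
	xoptattr_t *xoap;
	boolean_t readonly = B_FALSE;

	xva_init(&xva);
	XVA_SET_REQ(&xva, XAT_READONLY);	/* request one optional attr */
	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL) == 0 &&
	    (xoap = xva_getxoptattr(&xva)) != NULL &&
	    XVA_ISSET_RTN(&xva, XAT_READONLY))
		readonly = xoap->xoa_readonly;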
+ */ +static vopstats_t * +create_vopstats_template() +{ + vopstats_t *vsp; + + vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP); + bzero(vsp, sizeof (*vsp)); /* Start fresh */ + + /* VOP_OPEN */ + kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64); + /* VOP_CLOSE */ + kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64); + /* VOP_READ I/O */ + kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64); + /* VOP_WRITE I/O */ + kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64); + /* VOP_IOCTL */ + kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64); + /* VOP_SETFL */ + kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64); + /* VOP_GETATTR */ + kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64); + /* VOP_SETATTR */ + kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64); + /* VOP_ACCESS */ + kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64); + /* VOP_LOOKUP */ + kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64); + /* VOP_CREATE */ + kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64); + /* VOP_REMOVE */ + kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64); + /* VOP_LINK */ + kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64); + /* VOP_RENAME */ + kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64); + /* VOP_MKDIR */ + kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64); + /* VOP_RMDIR */ + kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64); + /* VOP_READDIR I/O */ + kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->readdir_bytes, "readdir_bytes", + KSTAT_DATA_UINT64); + /* VOP_SYMLINK */ + kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64); + /* VOP_READLINK */ + kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64); + /* VOP_FSYNC */ + kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64); + /* VOP_INACTIVE */ + kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64); + /* VOP_FID */ + kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64); + /* VOP_RWLOCK */ + kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64); + /* VOP_RWUNLOCK */ + kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64); + /* VOP_SEEK */ + kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64); + /* VOP_CMP */ + kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64); + /* VOP_FRLOCK */ + kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64); + /* VOP_SPACE */ + kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64); + /* VOP_REALVP */ + kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64); + /* VOP_GETPAGE */ + kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64); + /* VOP_PUTPAGE */ + kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64); + /* VOP_MAP */ + kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64); + /* VOP_ADDMAP */ + kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64); + /* VOP_DELMAP */ + kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64); + /* VOP_POLL */ + kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64); + /* VOP_DUMP */ + kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64); + /* VOP_PATHCONF */ + kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64); + /* VOP_PAGEIO */ + kstat_named_init(&vsp->npageio, "npageio", 
KSTAT_DATA_UINT64); + /* VOP_DUMPCTL */ + kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64); + /* VOP_DISPOSE */ + kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64); + /* VOP_SETSECATTR */ + kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64); + /* VOP_GETSECATTR */ + kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64); + /* VOP_SHRLOCK */ + kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64); + /* VOP_VNEVENT */ + kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); + /* VOP_REQZCBUF */ + kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64); + /* VOP_RETZCBUF */ + kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64); + + return (vsp); +} + +/* + * Creates a kstat structure associated with a vopstats structure. + */ +kstat_t * +new_vskstat(char *ksname, vopstats_t *vsp) +{ + kstat_t *ksp; + + if (!vopstats_enabled) { + return (NULL); + } + + ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED, + sizeof (vopstats_t)/sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); + if (ksp) { + ksp->ks_data = vsp; + kstat_install(ksp); + } + + return (ksp); +} + +/* + * Called from vfsinit() to initialize the support mechanisms for vopstats + */ +void +vopstats_startup() +{ + if (!vopstats_enabled) + return; + + /* + * Creates the AVL tree which holds per-vfs vopstat anchors. This + * is necessary since we need to check if a kstat exists before we + * attempt to create it. Also, initialize its lock. + */ + avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t), + offsetof(vsk_anchor_t, vsk_node)); + mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL); + + vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache", + sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL, + NULL, NULL, 0); + + /* + * Set up the array of pointers for the vopstats-by-FS-type. + * The entries will be allocated/initialized as each file system + * goes through modload/mod_installfs. + */ + vopstats_fstype = (vopstats_t **)kmem_zalloc( + (sizeof (vopstats_t *) * nfstype), KM_SLEEP); + + /* Set up the global vopstats initialization template */ + vs_templatep = create_vopstats_template(); +} + +/* + * We need to have the all of the counters zeroed. + * The initialization of the vopstats_t includes on the order of + * 50 calls to kstat_named_init(). Rather that do that on every call, + * we do it once in a template (vs_templatep) then bcopy it over. + */ +void +initialize_vopstats(vopstats_t *vsp) +{ + if (vsp == NULL) + return; + + bcopy(vs_templatep, vsp, sizeof (vopstats_t)); +} + +/* + * If possible, determine which vopstats by fstype to use and + * return a pointer to the caller. + */ +vopstats_t * +get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp) +{ + int fstype = 0; /* Index into vfssw[] */ + vopstats_t *vsp = NULL; + + if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || + !vopstats_enabled) + return (NULL); + /* + * Set up the fstype. We go to so much trouble because all versions + * of NFS use the same fstype in their vfs even though they have + * distinct entries in the vfssw[] table. + * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry. + */ + if (vswp) { + fstype = vswp - vfssw; /* Gets us the index */ + } else { + fstype = vfsp->vfs_fstype; + } + + /* + * Point to the per-fstype vopstats. The only valid values are + * non-zero positive values less than the number of vfssw[] table + * entries. 
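/*
 * The per-mount kstat created through new_vskstat()/get_vskstat_anchor() is
 * published under module "unix" with a name built from VOPSTATS_STR and the
 * fsid in hex (conventionally "vopstats_<fsid>"), so it can be inspected
 * from user space with libkstat.  A sketch; the fsid string is illustrative:
 */
#include <stdio.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t *kc = kstat_open();
	kstat_t *ksp;
	kstat_named_t *kn;

	if (kc == NULL)
		return (1);
	ksp = kstat_lookup(kc, "unix", 0, "vopstats_2a80002");
	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "nread")) != NULL)
		(void) printf("nread = %llu\n",
		    (unsigned long long)kn->value.ui64);
	(void) kstat_close(kc);
	return (0);
}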
+ */ + if (fstype > 0 && fstype < nfstype) { + vsp = vopstats_fstype[fstype]; + } + + return (vsp); +} + +/* + * Generate a kstat name, create the kstat structure, and allocate a + * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t + * to the caller. This must only be called from a mount. + */ +vsk_anchor_t * +get_vskstat_anchor(vfs_t *vfsp) +{ + char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */ + statvfs64_t statvfsbuf; /* Needed to find f_fsid */ + vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */ + kstat_t *ksp; /* Ptr to new kstat */ + avl_index_t where; /* Location in the AVL tree */ + + if (vfsp == NULL || vfsp->vfs_implp == NULL || + (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) + return (NULL); + + /* Need to get the fsid to build a kstat name */ + if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) { + /* Create a name for our kstats based on fsid */ + (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx", + VOPSTATS_STR, statvfsbuf.f_fsid); + + /* Allocate and initialize the vsk_anchor_t */ + vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP); + bzero(vskp, sizeof (*vskp)); + vskp->vsk_fsid = statvfsbuf.f_fsid; + + mutex_enter(&vskstat_tree_lock); + if (avl_find(&vskstat_tree, vskp, &where) == NULL) { + avl_insert(&vskstat_tree, vskp, where); + mutex_exit(&vskstat_tree_lock); + + /* + * Now that we've got the anchor in the AVL + * tree, we can create the kstat. + */ + ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats); + if (ksp) { + vskp->vsk_ksp = ksp; + } + } else { + /* Oops, found one! Release memory and lock. */ + mutex_exit(&vskstat_tree_lock); + kmem_cache_free(vsk_anchor_cache, vskp); + vskp = NULL; + } + } + return (vskp); +} + +/* + * We're in the process of tearing down the vfs and need to cleanup + * the data structures associated with the vopstats. Must only be called + * from dounmount(). + */ +void +teardown_vopstats(vfs_t *vfsp) +{ + vsk_anchor_t *vskap; + avl_index_t where; + + if (vfsp == NULL || vfsp->vfs_implp == NULL || + (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) + return; + + /* This is a safe check since VFS_STATS must be set (see above) */ + if ((vskap = vfsp->vfs_vskap) == NULL) + return; + + /* Whack the pointer right away */ + vfsp->vfs_vskap = NULL; + + /* Lock the tree, remove the node, and delete the kstat */ + mutex_enter(&vskstat_tree_lock); + if (avl_find(&vskstat_tree, vskap, &where)) { + avl_remove(&vskstat_tree, vskap); + } + + if (vskap->vsk_ksp) { + kstat_delete(vskap->vsk_ksp); + } + mutex_exit(&vskstat_tree_lock); + + kmem_cache_free(vsk_anchor_cache, vskap); +} + +/* + * Read or write a vnode. Called from kernel code. + */ +int +vn_rdwr( + enum uio_rw rw, + struct vnode *vp, + caddr_t base, + ssize_t len, + offset_t offset, + enum uio_seg seg, + int ioflag, + rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ + cred_t *cr, + ssize_t *residp) +{ + struct uio uio; + struct iovec iov; + int error; + int in_crit = 0; + + if (rw == UIO_WRITE && ISROFILE(vp)) + return (EROFS); + + if (len < 0) + return (EIO); + + VOPXID_MAP_CR(vp, cr); + + iov.iov_base = base; + iov.iov_len = len; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_loffset = offset; + uio.uio_segflg = (short)seg; + uio.uio_resid = len; + uio.uio_llimit = ulimit; + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = 1; + error = nbl_svmand(vp, cr, &svmand); + if (error != 0) + goto done; + if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ, + uio.uio_offset, uio.uio_resid, svmand, NULL)) { + error = EACCES; + goto done; + } + } + + (void) VOP_RWLOCK(vp, + rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); + if (rw == UIO_WRITE) { + uio.uio_fmode = FWRITE; + uio.uio_extflg = UIO_COPY_DEFAULT; + error = VOP_WRITE(vp, &uio, ioflag, cr, NULL); + } else { + uio.uio_fmode = FREAD; + uio.uio_extflg = UIO_COPY_CACHED; + error = VOP_READ(vp, &uio, ioflag, cr, NULL); + } + VOP_RWUNLOCK(vp, + rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); + if (residp) + *residp = uio.uio_resid; + else if (uio.uio_resid) + error = EIO; + +done: + if (in_crit) + nbl_end_crit(vp); + return (error); +} + +/* + * Release a vnode. Call VOP_INACTIVE on last reference or + * decrement reference count. + * + * To avoid race conditions, the v_count is left at 1 for + * the call to VOP_INACTIVE. This prevents another thread + * from reclaiming and releasing the vnode *before* the + * VOP_INACTIVE routine has a chance to destroy the vnode. + * We can't have more than 1 thread calling VOP_INACTIVE + * on a vnode. + */ +void +vn_rele(vnode_t *vp) +{ + VERIFY(vp->v_count > 0); + mutex_enter(&vp->v_lock); + if (vp->v_count == 1) { + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } + vp->v_count--; + mutex_exit(&vp->v_lock); +} + +/* + * Release a vnode referenced by the DNLC. Multiple DNLC references are treated + * as a single reference, so v_count is not decremented until the last DNLC hold + * is released. This makes it possible to distinguish vnodes that are referenced + * only by the DNLC. + */ +void +vn_rele_dnlc(vnode_t *vp) +{ + VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); + mutex_enter(&vp->v_lock); + if (--vp->v_count_dnlc == 0) { + if (vp->v_count == 1) { + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } + vp->v_count--; + } + mutex_exit(&vp->v_lock); +} + +/* + * Like vn_rele() except that it clears v_stream under v_lock. + * This is used by sockfs when it dismantels the association between + * the sockfs node and the vnode in the underlaying file system. + * v_lock has to be held to prevent a thread coming through the lookupname + * path from accessing a stream head that is going away. + */ +void +vn_rele_stream(vnode_t *vp) +{ + VERIFY(vp->v_count > 0); + mutex_enter(&vp->v_lock); + vp->v_stream = NULL; + if (vp->v_count == 1) { + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } + vp->v_count--; + mutex_exit(&vp->v_lock); +} + +static void +vn_rele_inactive(vnode_t *vp) +{ + VOP_INACTIVE(vp, CRED(), NULL); +} + +/* + * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it + * asynchronously using a taskq. This can avoid deadlocks caused by re-entering + * the file system as a result of releasing the vnode. Note, file systems + * already have to handle the race where the vnode is incremented before the + * inactive routine is called and does its locking. + * + * Warning: Excessive use of this routine can lead to performance problems. + * This is because taskqs throttle back allocation if too many are created. 
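/*
 * Minimal sketch of an in-kernel read through vn_rdwr() above, given an
 * already-held vnode; the buffer size and offset are illustrative:
 */
	char buf[512];
	ssize_t resid;
	int err;

	err = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof (buf), (offset_t)0,
	    UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
	if (err == 0 && resid != 0) {
		/* short read: only sizeof (buf) - resid bytes are valid */
	}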
+ */ +void +vn_rele_async(vnode_t *vp, taskq_t *taskq) +{ + VERIFY(vp->v_count > 0); + mutex_enter(&vp->v_lock); + if (vp->v_count == 1) { + mutex_exit(&vp->v_lock); + VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive, + vp, TQ_SLEEP) != NULL); + return; + } + vp->v_count--; + mutex_exit(&vp->v_lock); +} + +int +vn_open( + char *pnamep, + enum uio_seg seg, + int filemode, + int createmode, + struct vnode **vpp, + enum create crwhy, + mode_t umask) +{ + return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, + umask, NULL, -1)); +} + + +/* + * Open/create a vnode. + * This may be callable by the kernel, the only known use + * of user context being that the current user credentials + * are used for permissions. crwhy is defined iff filemode & FCREAT. + */ +int +vn_openat( + char *pnamep, + enum uio_seg seg, + int filemode, + int createmode, + struct vnode **vpp, + enum create crwhy, + mode_t umask, + struct vnode *startvp, + int fd) +{ + struct vnode *vp; + int mode; + int accessflags; + int error; + int in_crit = 0; + int open_done = 0; + int shrlock_done = 0; + struct vattr vattr; + enum symfollow follow; + int estale_retry = 0; + struct shrlock shr; + struct shr_locowner shr_own; + + mode = 0; + accessflags = 0; + if (filemode & FREAD) + mode |= VREAD; + if (filemode & (FWRITE|FTRUNC)) + mode |= VWRITE; + if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN)) + mode |= VEXEC; + + /* symlink interpretation */ + if (filemode & FNOFOLLOW) + follow = NO_FOLLOW; + else + follow = FOLLOW; + + if (filemode & FAPPEND) + accessflags |= V_APPEND; + +top: + if (filemode & FCREAT) { + enum vcexcl excl; + + /* + * Wish to create a file. + */ + vattr.va_type = VREG; + vattr.va_mode = createmode; + vattr.va_mask = AT_TYPE|AT_MODE; + if (filemode & FTRUNC) { + vattr.va_size = 0; + vattr.va_mask |= AT_SIZE; + } + if (filemode & FEXCL) + excl = EXCL; + else + excl = NONEXCL; + + if (error = + vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy, + (filemode & ~(FTRUNC|FEXCL)), umask, startvp)) + return (error); + } else { + /* + * Wish to open a file. Just look it up. + */ + if (error = lookupnameat(pnamep, seg, follow, + NULLVPP, &vp, startvp)) { + if ((error == ESTALE) && + fs_need_estale_retry(estale_retry++)) + goto top; + return (error); + } + + /* + * Get the attributes to check whether file is large. + * We do this only if the FOFFMAX flag is not set and + * only for regular files. + */ + + if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) { + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, + CRED(), NULL))) { + goto out; + } + if (vattr.va_size > (u_offset_t)MAXOFF32_T) { + /* + * Large File API - regular open fails + * if FOFFMAX flag is set in file mode + */ + error = EOVERFLOW; + goto out; + } + } + /* + * Can't write directories, active texts, or + * read-only filesystems. Can't truncate files + * on which mandatory locking is in effect. + */ + if (filemode & (FWRITE|FTRUNC)) { + /* + * Allow writable directory if VDIROPEN flag is set. + */ + if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) { + error = EISDIR; + goto out; + } + if (ISROFILE(vp)) { + error = EROFS; + goto out; + } + /* + * Can't truncate files on which + * sysv mandatory locking is in effect. 
+ */ + if (filemode & FTRUNC) { + vnode_t *rvp; + + if (VOP_REALVP(vp, &rvp, NULL) != 0) + rvp = vp; + if (rvp->v_filocks != NULL) { + vattr.va_mask = AT_MODE; + if ((error = VOP_GETATTR(vp, + &vattr, 0, CRED(), NULL)) == 0 && + MANDLOCK(vp, vattr.va_mode)) + error = EAGAIN; + } + } + if (error) + goto out; + } + /* + * Check permissions. + */ + if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL)) + goto out; + /* + * Require FSEARCH to return a directory. + * Require FEXEC to return a regular file. + */ + if ((filemode & FSEARCH) && vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + if ((filemode & FEXEC) && vp->v_type != VREG) { + error = ENOEXEC; /* XXX: error code? */ + goto out; + } + } + + /* + * Do remaining checks for FNOFOLLOW and FNOLINKS. + */ + if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { + error = ELOOP; + goto out; + } + if (filemode & FNOLINKS) { + vattr.va_mask = AT_NLINK; + if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) { + goto out; + } + if (vattr.va_nlink != 1) { + error = EMLINK; + goto out; + } + } + + /* + * Opening a socket corresponding to the AF_UNIX pathname + * in the filesystem name space is not supported. + * However, VSOCK nodes in namefs are supported in order + * to make fattach work for sockets. + * + * XXX This uses VOP_REALVP to distinguish between + * an unopened namefs node (where VOP_REALVP returns a + * different VSOCK vnode) and a VSOCK created by vn_create + * in some file system (where VOP_REALVP would never return + * a different vnode). + */ + if (vp->v_type == VSOCK) { + struct vnode *nvp; + + error = VOP_REALVP(vp, &nvp, NULL); + if (error != 0 || nvp == NULL || nvp == vp || + nvp->v_type != VSOCK) { + error = EOPNOTSUPP; + goto out; + } + } + + if ((vp->v_type == VREG) && nbl_need_check(vp)) { + /* get share reservation */ + shr.s_access = 0; + if (filemode & FWRITE) + shr.s_access |= F_WRACC; + if (filemode & FREAD) + shr.s_access |= F_RDACC; + shr.s_deny = 0; + shr.s_sysid = 0; + shr.s_pid = ttoproc(curthread)->p_pid; + shr_own.sl_pid = shr.s_pid; + shr_own.sl_id = fd; + shr.s_own_len = sizeof (shr_own); + shr.s_owner = (caddr_t)&shr_own; + error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(), + NULL); + if (error) + goto out; + shrlock_done = 1; + + /* nbmand conflict check if truncating file */ + if ((filemode & FTRUNC) && !(filemode & FCREAT)) { + nbl_start_crit(vp, RW_READER); + in_crit = 1; + + vattr.va_mask = AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) + goto out; + if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0, + NULL)) { + error = EACCES; + goto out; + } + } + } + + /* + * Do opening protocol. + */ + error = VOP_OPEN(&vp, filemode, CRED(), NULL); + if (error) + goto out; + open_done = 1; + + /* + * Truncate if required. + */ + if ((filemode & FTRUNC) && !(filemode & FCREAT)) { + vattr.va_size = 0; + vattr.va_mask = AT_SIZE; + if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) + goto out; + } +out: + ASSERT(vp->v_count > 0); + + if (in_crit) { + nbl_end_crit(vp); + in_crit = 0; + } + if (error) { + if (open_done) { + (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(), + NULL); + open_done = 0; + shrlock_done = 0; + } + if (shrlock_done) { + (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(), + NULL); + shrlock_done = 0; + } + + /* + * The following clause was added to handle a problem + * with NFS consistency. It is possible that a lookup + * of the file to be opened succeeded, but the file + * itself doesn't actually exist on the server. 
This + * is chiefly due to the DNLC containing an entry for + * the file which has been removed on the server. In + * this case, we just start over. If there was some + * other cause for the ESTALE error, then the lookup + * of the file will fail and the error will be returned + * above instead of looping around from here. + */ + VN_RELE(vp); + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto top; + } else + *vpp = vp; + return (error); +} + +/* + * The following two accessor functions are for the NFSv4 server. Since there + * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the + * vnode open counts correct when a client "upgrades" an open or does an + * open_downgrade. In NFS, an upgrade or downgrade can not only change the + * open mode (add or subtract read or write), but also change the share/deny + * modes. However, share reservations are not integrated with OPEN, yet, so + * we need to handle each separately. These functions are cleaner than having + * the NFS server manipulate the counts directly, however, nobody else should + * use these functions. + */ +void +vn_open_upgrade( + vnode_t *vp, + int filemode) +{ + ASSERT(vp->v_type == VREG); + + if (filemode & FREAD) + atomic_add_32(&(vp->v_rdcnt), 1); + if (filemode & FWRITE) + atomic_add_32(&(vp->v_wrcnt), 1); + +} + +void +vn_open_downgrade( + vnode_t *vp, + int filemode) +{ + ASSERT(vp->v_type == VREG); + + if (filemode & FREAD) { + ASSERT(vp->v_rdcnt > 0); + atomic_add_32(&(vp->v_rdcnt), -1); + } + if (filemode & FWRITE) { + ASSERT(vp->v_wrcnt > 0); + atomic_add_32(&(vp->v_wrcnt), -1); + } + +} + +int +vn_create( + char *pnamep, + enum uio_seg seg, + struct vattr *vap, + enum vcexcl excl, + int mode, + struct vnode **vpp, + enum create why, + int flag, + mode_t umask) +{ + return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag, + umask, NULL)); +} + +/* + * Create a vnode (makenode). + */ +int +vn_createat( + char *pnamep, + enum uio_seg seg, + struct vattr *vap, + enum vcexcl excl, + int mode, + struct vnode **vpp, + enum create why, + int flag, + mode_t umask, + struct vnode *startvp) +{ + struct vnode *dvp; /* ptr to parent dir vnode */ + struct vnode *vp = NULL; + struct pathname pn; + int error; + int in_crit = 0; + struct vattr vattr; + enum symfollow follow; + int estale_retry = 0; + uint32_t auditing = AU_AUDITING(); + + ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); + + /* symlink interpretation */ + if ((flag & FNOFOLLOW) || excl == EXCL) + follow = NO_FOLLOW; + else + follow = FOLLOW; + flag &= ~(FNOFOLLOW|FNOLINKS); + +top: + /* + * Lookup directory. + * If new object is a file, call lower level to create it. + * Note that it is up to the lower level to enforce exclusive + * creation, if the file is already there. + * This allows the lower level to do whatever + * locking or protocol that is needed to prevent races. + * If the new object is directory call lower level to make + * the new directory, with "." and "..". + */ + if (error = pn_get(pnamep, seg, &pn)) + return (error); + if (auditing) + audit_vncreate_start(); + dvp = NULL; + *vpp = NULL; + /* + * lookup will find the parent directory for the vnode. + * When it is done the pn holds the name of the entry + * in the directory. + * If this is a non-exclusive create we also find the node itself. + */ + error = lookuppnat(&pn, NULL, follow, &dvp, + (excl == EXCL) ? 
NULLVPP : vpp, startvp); + if (error) { + pn_free(&pn); + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto top; + if (why == CRMKDIR && error == EINVAL) + error = EEXIST; /* SVID */ + return (error); + } + + if (why != CRMKNOD) + vap->va_mode &= ~VSVTX; + + /* + * If default ACLs are defined for the directory don't apply the + * umask if umask is passed. + */ + + if (umask) { + + vsecattr_t vsec; + + vsec.vsa_aclcnt = 0; + vsec.vsa_aclentp = NULL; + vsec.vsa_dfaclcnt = 0; + vsec.vsa_dfaclentp = NULL; + vsec.vsa_mask = VSA_DFACLCNT; + error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL); + /* + * If error is ENOSYS then treat it as no error + * Don't want to force all file systems to support + * aclent_t style of ACL's. + */ + if (error == ENOSYS) + error = 0; + if (error) { + if (*vpp != NULL) + VN_RELE(*vpp); + goto out; + } else { + /* + * Apply the umask if no default ACLs. + */ + if (vsec.vsa_dfaclcnt == 0) + vap->va_mode &= ~umask; + + /* + * VOP_GETSECATTR() may have allocated memory for + * ACLs we didn't request, so double-check and + * free it if necessary. + */ + if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL) + kmem_free((caddr_t)vsec.vsa_aclentp, + vsec.vsa_aclcnt * sizeof (aclent_t)); + if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL) + kmem_free((caddr_t)vsec.vsa_dfaclentp, + vsec.vsa_dfaclcnt * sizeof (aclent_t)); + } + } + + /* + * In general we want to generate EROFS if the file system is + * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1 + * documents the open system call, and it says that O_CREAT has no + * effect if the file already exists. Bug 1119649 states + * that open(path, O_CREAT, ...) fails when attempting to open an + * existing file on a read only file system. Thus, the first part + * of the following if statement has 3 checks: + * if the file exists && + * it is being open with write access && + * the file system is read only + * then generate EROFS + */ + if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) || + (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { + if (*vpp) + VN_RELE(*vpp); + error = EROFS; + } else if (excl == NONEXCL && *vpp != NULL) { + vnode_t *rvp; + + /* + * File already exists. If a mandatory lock has been + * applied, return error. + */ + vp = *vpp; + if (VOP_REALVP(vp, &rvp, NULL) != 0) + rvp = vp; + if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) { + nbl_start_crit(vp, RW_READER); + in_crit = 1; + } + if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) { + vattr.va_mask = AT_MODE|AT_SIZE; + if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) { + goto out; + } + if (MANDLOCK(vp, vattr.va_mode)) { + error = EAGAIN; + goto out; + } + /* + * File cannot be truncated if non-blocking mandatory + * locks are currently on the file. + */ + if ((vap->va_mask & AT_SIZE) && in_crit) { + u_offset_t offset; + ssize_t length; + + offset = vap->va_size > vattr.va_size ? + vattr.va_size : vap->va_size; + length = vap->va_size > vattr.va_size ? + vap->va_size - vattr.va_size : + vattr.va_size - vap->va_size; + if (nbl_conflict(vp, NBL_WRITE, offset, + length, 0, NULL)) { + error = EACCES; + goto out; + } + } + } + + /* + * If the file is the root of a VFS, we've crossed a + * mount point and the "containing" directory that we + * acquired above (dvp) is irrelevant because it's in + * a different file system. 
We apply VOP_CREATE to the + * target itself instead of to the containing directory + * and supply a null path name to indicate (conventionally) + * the node itself as the "component" of interest. + * + * The intercession of the file system is necessary to + * ensure that the appropriate permission checks are + * done. + */ + if (vp->v_flag & VROOT) { + ASSERT(why != CRMKDIR); + error = VOP_CREATE(vp, "", vap, excl, mode, vpp, + CRED(), flag, NULL, NULL); + /* + * If the create succeeded, it will have created + * a new reference to the vnode. Give up the + * original reference. The assertion should not + * get triggered because NBMAND locks only apply to + * VREG files. And if in_crit is non-zero for some + * reason, detect that here, rather than when we + * deference a null vp. + */ + ASSERT(in_crit == 0); + VN_RELE(vp); + vp = NULL; + goto out; + } + + /* + * Large File API - non-large open (FOFFMAX flag not set) + * of regular file fails if the file size exceeds MAXOFF32_T. + */ + if (why != CRMKDIR && + !(flag & FOFFMAX) && + (vp->v_type == VREG)) { + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, + CRED(), NULL))) { + goto out; + } + if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) { + error = EOVERFLOW; + goto out; + } + } + } + + if (error == 0) { + /* + * Call mkdir() if specified, otherwise create(). + */ + int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */ + + if (why == CRMKDIR) + /* + * N.B., if vn_createat() ever requests + * case-insensitive behavior then it will need + * to be passed to VOP_MKDIR(). VOP_CREATE() + * will already get it via "flag" + */ + error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(), + NULL, 0, NULL); + else if (!must_be_dir) + error = VOP_CREATE(dvp, pn.pn_path, vap, + excl, mode, vpp, CRED(), flag, NULL, NULL); + else + error = ENOTDIR; + } + +out: + + if (auditing) + audit_vncreate_finish(*vpp, error); + if (in_crit) { + nbl_end_crit(vp); + in_crit = 0; + } + if (vp != NULL) { + VN_RELE(vp); + vp = NULL; + } + pn_free(&pn); + VN_RELE(dvp); + /* + * The following clause was added to handle a problem + * with NFS consistency. It is possible that a lookup + * of the file to be created succeeded, but the file + * itself doesn't actually exist on the server. This + * is chiefly due to the DNLC containing an entry for + * the file which has been removed on the server. In + * this case, we just start over. If there was some + * other cause for the ESTALE error, then the lookup + * of the file will fail and the error will be returned + * above instead of looping around from here. 
+ */ + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto top; + return (error); +} + +int +vn_link(char *from, char *to, enum uio_seg seg) +{ + return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg)); +} + +int +vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow, + vnode_t *tstartvp, char *to, enum uio_seg seg) +{ + struct vnode *fvp; /* from vnode ptr */ + struct vnode *tdvp; /* to directory vnode ptr */ + struct pathname pn; + int error; + struct vattr vattr; + dev_t fsid; + int estale_retry = 0; + uint32_t auditing = AU_AUDITING(); + +top: + fvp = tdvp = NULL; + if (error = pn_get(to, seg, &pn)) + return (error); + if (auditing && fstartvp != NULL) + audit_setfsat_path(1); + if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp)) + goto out; + if (auditing && tstartvp != NULL) + audit_setfsat_path(3); + if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp)) + goto out; + /* + * Make sure both source vnode and target directory vnode are + * in the same vfs and that it is writeable. + */ + vattr.va_mask = AT_FSID; + if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL)) + goto out; + fsid = vattr.va_fsid; + vattr.va_mask = AT_FSID; + if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL)) + goto out; + if (fsid != vattr.va_fsid) { + error = EXDEV; + goto out; + } + if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) { + error = EROFS; + goto out; + } + /* + * Do the link. + */ + (void) pn_fixslash(&pn); + error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0); +out: + pn_free(&pn); + if (fvp) + VN_RELE(fvp); + if (tdvp) + VN_RELE(tdvp); + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto top; + return (error); +} + +int +vn_rename(char *from, char *to, enum uio_seg seg) +{ + return (vn_renameat(NULL, from, NULL, to, seg)); +} + +int +vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp, + char *tname, enum uio_seg seg) +{ + int error; + struct vattr vattr; + struct pathname fpn; /* from pathname */ + struct pathname tpn; /* to pathname */ + dev_t fsid; + int in_crit_src, in_crit_targ; + vnode_t *fromvp, *fvp; + vnode_t *tovp, *targvp; + int estale_retry = 0; + uint32_t auditing = AU_AUDITING(); + +top: + fvp = fromvp = tovp = targvp = NULL; + in_crit_src = in_crit_targ = 0; + /* + * Get to and from pathnames. + */ + if (error = pn_get(fname, seg, &fpn)) + return (error); + if (error = pn_get(tname, seg, &tpn)) { + pn_free(&fpn); + return (error); + } + + /* + * First we need to resolve the correct directories + * The passed in directories may only be a starting point, + * but we need the real directories the file(s) live in. + * For example the fname may be something like usr/lib/sparc + * and we were passed in the / directory, but we need to + * use the lib directory for the rename. + */ + + if (auditing && fdvp != NULL) + audit_setfsat_path(1); + /* + * Lookup to and from directories. + */ + if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) { + goto out; + } + + /* + * Make sure there is an entry. + */ + if (fvp == NULL) { + error = ENOENT; + goto out; + } + + if (auditing && tdvp != NULL) + audit_setfsat_path(3); + if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) { + goto out; + } + + /* + * Make sure both the from vnode directory and the to directory + * are in the same vfs and the to directory is writable. + * We check fsid's, not vfs pointers, so loopback fs works. 
+ */ + if (fromvp != tovp) { + vattr.va_mask = AT_FSID; + if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL)) + goto out; + fsid = vattr.va_fsid; + vattr.va_mask = AT_FSID; + if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL)) + goto out; + if (fsid != vattr.va_fsid) { + error = EXDEV; + goto out; + } + } + + if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) { + error = EROFS; + goto out; + } + + if (targvp && (fvp != targvp)) { + nbl_start_crit(targvp, RW_READER); + in_crit_targ = 1; + if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) { + error = EACCES; + goto out; + } + } + + if (nbl_need_check(fvp)) { + nbl_start_crit(fvp, RW_READER); + in_crit_src = 1; + if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) { + error = EACCES; + goto out; + } + } + + /* + * Do the rename. + */ + (void) pn_fixslash(&tpn); + error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(), + NULL, 0); + +out: + pn_free(&fpn); + pn_free(&tpn); + if (in_crit_src) + nbl_end_crit(fvp); + if (in_crit_targ) + nbl_end_crit(targvp); + if (fromvp) + VN_RELE(fromvp); + if (tovp) + VN_RELE(tovp); + if (targvp) + VN_RELE(targvp); + if (fvp) + VN_RELE(fvp); + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto top; + return (error); +} + +/* + * Remove a file or directory. + */ +int +vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) +{ + return (vn_removeat(NULL, fnamep, seg, dirflag)); +} + +int +vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag) +{ + struct vnode *vp; /* entry vnode */ + struct vnode *dvp; /* ptr to parent dir vnode */ + struct vnode *coveredvp; + struct pathname pn; /* name of entry */ + enum vtype vtype; + int error; + struct vfs *vfsp; + struct vfs *dvfsp; /* ptr to parent dir vfs */ + int in_crit = 0; + int estale_retry = 0; + +top: + if (error = pn_get(fnamep, seg, &pn)) + return (error); + dvp = vp = NULL; + if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) { + pn_free(&pn); + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto top; + return (error); + } + + /* + * Make sure there is an entry. + */ + if (vp == NULL) { + error = ENOENT; + goto out; + } + + vfsp = vp->v_vfsp; + dvfsp = dvp->v_vfsp; + + /* + * If the named file is the root of a mounted filesystem, fail, + * unless it's marked unlinkable. In that case, unmount the + * filesystem and proceed to unlink the covered vnode. (If the + * covered vnode is a directory, use rmdir instead of unlink, + * to avoid file system corruption.) + */ + if (vp->v_flag & VROOT) { + if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) { + error = EBUSY; + goto out; + } + + /* + * Namefs specific code starts here. + */ + + if (dirflag == RMDIRECTORY) { + /* + * User called rmdir(2) on a file that has + * been namefs mounted on top of. Since + * namefs doesn't allow directories to + * be mounted on other files we know + * vp is not of type VDIR so fail to operation. + */ + error = ENOTDIR; + goto out; + } + + /* + * If VROOT is still set after grabbing vp->v_lock, + * noone has finished nm_unmount so far and coveredvp + * is valid. + * If we manage to grab vn_vfswlock(coveredvp) before releasing + * vp->v_lock, any race window is eliminated. 
+ */ + + mutex_enter(&vp->v_lock); + if ((vp->v_flag & VROOT) == 0) { + /* Someone beat us to the unmount */ + mutex_exit(&vp->v_lock); + error = EBUSY; + goto out; + } + vfsp = vp->v_vfsp; + coveredvp = vfsp->vfs_vnodecovered; + ASSERT(coveredvp); + /* + * Note: Implementation of vn_vfswlock shows that ordering of + * v_lock / vn_vfswlock is not an issue here. + */ + error = vn_vfswlock(coveredvp); + mutex_exit(&vp->v_lock); + + if (error) + goto out; + + VN_HOLD(coveredvp); + VN_RELE(vp); + error = dounmount(vfsp, 0, CRED()); + + /* + * Unmounted the namefs file system; now get + * the object it was mounted over. + */ + vp = coveredvp; + /* + * If namefs was mounted over a directory, then + * we want to use rmdir() instead of unlink(). + */ + if (vp->v_type == VDIR) + dirflag = RMDIRECTORY; + + if (error) + goto out; + } + + /* + * Make sure filesystem is writeable. + * We check the parent directory's vfs in case this is an lofs vnode. + */ + if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) { + error = EROFS; + goto out; + } + + vtype = vp->v_type; + + /* + * If there is the possibility of an nbmand share reservation, make + * sure it's okay to remove the file. Keep a reference to the + * vnode, so that we can exit the nbl critical region after + * calling VOP_REMOVE. + * If there is no possibility of an nbmand share reservation, + * release the vnode reference now. Filesystems like NFS may + * behave differently if there is an extra reference, so get rid of + * this one. Fortunately, we can't have nbmand mounts on NFS + * filesystems. + */ + if (nbl_need_check(vp)) { + nbl_start_crit(vp, RW_READER); + in_crit = 1; + if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) { + error = EACCES; + goto out; + } + } else { + VN_RELE(vp); + vp = NULL; + } + + if (dirflag == RMDIRECTORY) { + /* + * Caller is using rmdir(2), which can only be applied to + * directories. + */ + if (vtype != VDIR) { + error = ENOTDIR; + } else { + vnode_t *cwd; + proc_t *pp = curproc; + + mutex_enter(&pp->p_lock); + cwd = PTOU(pp)->u_cdir; + VN_HOLD(cwd); + mutex_exit(&pp->p_lock); + error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(), + NULL, 0); + VN_RELE(cwd); + } + } else { + /* + * Unlink(2) can be applied to anything. + */ + error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0); + } + +out: + pn_free(&pn); + if (in_crit) { + nbl_end_crit(vp); + in_crit = 0; + } + if (vp != NULL) + VN_RELE(vp); + if (dvp != NULL) + VN_RELE(dvp); + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto top; + return (error); +} + +/* + * Utility function to compare equality of vnodes. + * Compare the underlying real vnodes, if there are underlying vnodes. + * This is a more thorough comparison than the VN_CMP() macro provides. + */ +int +vn_compare(vnode_t *vp1, vnode_t *vp2) +{ + vnode_t *realvp; + + if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0) + vp1 = realvp; + if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0) + vp2 = realvp; + return (VN_CMP(vp1, vp2)); +} + +/* + * The number of locks to hash into. This value must be a power + * of 2 minus 1 and should probably also be prime. + */ +#define NUM_BUCKETS 1023 + +struct vn_vfslocks_bucket { + kmutex_t vb_lock; + vn_vfslocks_entry_t *vb_list; + char pad[64 - sizeof (kmutex_t) - sizeof (void *)]; +}; + +/* + * Total number of buckets will be NUM_BUCKETS + 1 . 
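/*
 * The removal entry points above (vn_remove()/vn_removeat()) are also used
 * directly by in-kernel consumers.  Minimal sketch; the pathname is
 * illustrative:
 */
	int err;

	/* Unlink a regular file by kernel-space pathname. */
	err = vn_remove("/var/run/myfs_scratch", UIO_SYSSPACE, RMFILE);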
+ */ + +#pragma align 64(vn_vfslocks_buckets) +static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1]; + +#define VN_VFSLOCKS_SHIFT 9 + +#define VN_VFSLOCKS_HASH(vfsvpptr) \ + ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS) + +/* + * vn_vfslocks_getlock() uses an HASH scheme to generate + * rwstlock using vfs/vnode pointer passed to it. + * + * vn_vfslocks_rele() releases a reference in the + * HASH table which allows the entry allocated by + * vn_vfslocks_getlock() to be freed at a later + * stage when the refcount drops to zero. + */ + +vn_vfslocks_entry_t * +vn_vfslocks_getlock(void *vfsvpptr) +{ + struct vn_vfslocks_bucket *bp; + vn_vfslocks_entry_t *vep; + vn_vfslocks_entry_t *tvep; + + ASSERT(vfsvpptr != NULL); + bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)]; + + mutex_enter(&bp->vb_lock); + for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) { + if (vep->ve_vpvfs == vfsvpptr) { + vep->ve_refcnt++; + mutex_exit(&bp->vb_lock); + return (vep); + } + } + mutex_exit(&bp->vb_lock); + vep = kmem_alloc(sizeof (*vep), KM_SLEEP); + rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL); + vep->ve_vpvfs = (char *)vfsvpptr; + vep->ve_refcnt = 1; + mutex_enter(&bp->vb_lock); + for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) { + if (tvep->ve_vpvfs == vfsvpptr) { + tvep->ve_refcnt++; + mutex_exit(&bp->vb_lock); + + /* + * There is already an entry in the hash + * destroy what we just allocated. + */ + rwst_destroy(&vep->ve_lock); + kmem_free(vep, sizeof (*vep)); + return (tvep); + } + } + vep->ve_next = bp->vb_list; + bp->vb_list = vep; + mutex_exit(&bp->vb_lock); + return (vep); +} + +void +vn_vfslocks_rele(vn_vfslocks_entry_t *vepent) +{ + struct vn_vfslocks_bucket *bp; + vn_vfslocks_entry_t *vep; + vn_vfslocks_entry_t *pvep; + + ASSERT(vepent != NULL); + ASSERT(vepent->ve_vpvfs != NULL); + + bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)]; + + mutex_enter(&bp->vb_lock); + vepent->ve_refcnt--; + + if ((int32_t)vepent->ve_refcnt < 0) + cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative"); + + if (vepent->ve_refcnt == 0) { + for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) { + if (vep->ve_vpvfs == vepent->ve_vpvfs) { + if (bp->vb_list == vep) + bp->vb_list = vep->ve_next; + else { + /* LINTED */ + pvep->ve_next = vep->ve_next; + } + mutex_exit(&bp->vb_lock); + rwst_destroy(&vep->ve_lock); + kmem_free(vep, sizeof (*vep)); + return; + } + pvep = vep; + } + cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found"); + } + mutex_exit(&bp->vb_lock); +} + +/* + * vn_vfswlock_wait is used to implement a lock which is logically a writers + * lock protecting the v_vfsmountedhere field. + * vn_vfswlock_wait has been modified to be similar to vn_vfswlock, + * except that it blocks to acquire the lock VVFSLOCK. + * + * traverse() and routines re-implementing part of traverse (e.g. autofs) + * need to hold this lock. mount(), vn_rename(), vn_remove() and so on + * need the non-blocking version of the writers lock i.e. 
vn_vfswlock + */ +int +vn_vfswlock_wait(vnode_t *vp) +{ + int retval; + vn_vfslocks_entry_t *vpvfsentry; + ASSERT(vp != NULL); + + vpvfsentry = vn_vfslocks_getlock(vp); + retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER); + + if (retval == EINTR) { + vn_vfslocks_rele(vpvfsentry); + return (EINTR); + } + return (retval); +} + +int +vn_vfsrlock_wait(vnode_t *vp) +{ + int retval; + vn_vfslocks_entry_t *vpvfsentry; + ASSERT(vp != NULL); + + vpvfsentry = vn_vfslocks_getlock(vp); + retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER); + + if (retval == EINTR) { + vn_vfslocks_rele(vpvfsentry); + return (EINTR); + } + + return (retval); +} + + +/* + * vn_vfswlock is used to implement a lock which is logically a writers lock + * protecting the v_vfsmountedhere field. + */ +int +vn_vfswlock(vnode_t *vp) +{ + vn_vfslocks_entry_t *vpvfsentry; + + /* + * If vp is NULL then somebody is trying to lock the covered vnode + * of /. (vfs_vnodecovered is NULL for /). This situation will + * only happen when unmounting /. Since that operation will fail + * anyway, return EBUSY here instead of in VFS_UNMOUNT. + */ + if (vp == NULL) + return (EBUSY); + + vpvfsentry = vn_vfslocks_getlock(vp); + + if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) + return (0); + + vn_vfslocks_rele(vpvfsentry); + return (EBUSY); +} + +int +vn_vfsrlock(vnode_t *vp) +{ + vn_vfslocks_entry_t *vpvfsentry; + + /* + * If vp is NULL then somebody is trying to lock the covered vnode + * of /. (vfs_vnodecovered is NULL for /). This situation will + * only happen when unmounting /. Since that operation will fail + * anyway, return EBUSY here instead of in VFS_UNMOUNT. + */ + if (vp == NULL) + return (EBUSY); + + vpvfsentry = vn_vfslocks_getlock(vp); + + if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) + return (0); + + vn_vfslocks_rele(vpvfsentry); + return (EBUSY); +} + +void +vn_vfsunlock(vnode_t *vp) +{ + vn_vfslocks_entry_t *vpvfsentry; + + /* + * ve_refcnt needs to be decremented twice. + * 1. To release refernce after a call to vn_vfslocks_getlock() + * 2. To release the reference from the locking routines like + * vn_vfsrlock/vn_vfswlock etc,. + */ + vpvfsentry = vn_vfslocks_getlock(vp); + vn_vfslocks_rele(vpvfsentry); + + rwst_exit(&vpvfsentry->ve_lock); + vn_vfslocks_rele(vpvfsentry); +} + +int +vn_vfswlock_held(vnode_t *vp) +{ + int held; + vn_vfslocks_entry_t *vpvfsentry; + + ASSERT(vp != NULL); + + vpvfsentry = vn_vfslocks_getlock(vp); + held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); + + vn_vfslocks_rele(vpvfsentry); + return (held); +} + + +int +vn_make_ops( + const char *name, /* Name of file system */ + const fs_operation_def_t *templ, /* Operation specification */ + vnodeops_t **actual) /* Return the vnodeops */ +{ + int unused_ops; + int error; + + *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP); + + (*actual)->vnop_name = name; + + error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ); + if (error) { + kmem_free(*actual, sizeof (vnodeops_t)); + } + +#if DEBUG + if (unused_ops != 0) + cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied " + "but not used", name, unused_ops); +#endif + + return (error); +} + +/* + * Free the vnodeops created as a result of vn_make_ops() + */ +void +vn_freevnodeops(vnodeops_t *vnops) +{ + kmem_free(vnops, sizeof (vnodeops_t)); +} + +/* + * Vnode cache. 
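/*
 * Sketch of how a file system drives vn_make_ops()/vn_freevnodeops() above
 * at module init/fini time.  The "myfs" names are hypothetical, and
 * myfs_vnodeops_template is a VOPNAME_* table in the style sketched for the
 * GFS entry points earlier:
 */
static vnodeops_t *myfs_vnodeops;

static int
myfs_create_vnodeops(void)
{
	return (vn_make_ops("myfs", myfs_vnodeops_template, &myfs_vnodeops));
}

static void
myfs_destroy_vnodeops(void)
{
	if (myfs_vnodeops != NULL)
		vn_freevnodeops(myfs_vnodeops);
}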
+ */ + +/* ARGSUSED */ +static int +vn_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct vnode *vp; + + vp = buf; + + mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); + rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); + vp->v_femhead = NULL; /* Must be done before vn_reinit() */ + vp->v_path = NULL; + vp->v_mpssdata = NULL; + vp->v_vsd = NULL; + vp->v_fopdata = NULL; + + return (0); +} + +/* ARGSUSED */ +static void +vn_cache_destructor(void *buf, void *cdrarg) +{ + struct vnode *vp; + + vp = buf; + + rw_destroy(&vp->v_nbllock); + cv_destroy(&vp->v_cv); + mutex_destroy(&vp->v_vsd_lock); + mutex_destroy(&vp->v_lock); +} + +void +vn_create_cache(void) +{ + /* LINTED */ + ASSERT((1 << VNODE_ALIGN_LOG2) == + P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN)); + vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode), + VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL, + NULL, 0); +} + +void +vn_destroy_cache(void) +{ + kmem_cache_destroy(vn_cache); +} + +/* + * Used by file systems when fs-specific nodes (e.g., ufs inodes) are + * cached by the file system and vnodes remain associated. + */ +void +vn_recycle(vnode_t *vp) +{ + ASSERT(vp->v_pages == NULL); + + /* + * XXX - This really belongs in vn_reinit(), but we have some issues + * with the counts. Best to have it here for clean initialization. + */ + vp->v_rdcnt = 0; + vp->v_wrcnt = 0; + vp->v_mmap_read = 0; + vp->v_mmap_write = 0; + + /* + * If FEM was in use, make sure everything gets cleaned up + * NOTE: vp->v_femhead is initialized to NULL in the vnode + * constructor. + */ + if (vp->v_femhead) { + /* XXX - There should be a free_femhead() that does all this */ + ASSERT(vp->v_femhead->femh_list == NULL); + mutex_destroy(&vp->v_femhead->femh_lock); + kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); + vp->v_femhead = NULL; + } + if (vp->v_path) { + kmem_free(vp->v_path, strlen(vp->v_path) + 1); + vp->v_path = NULL; + } + + if (vp->v_fopdata != NULL) { + free_fopdata(vp); + } + vp->v_mpssdata = NULL; + vsd_free(vp); +} + +/* + * Used to reset the vnode fields including those that are directly accessible + * as well as those which require an accessor function. + * + * Does not initialize: + * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv + * v_data (since FS-nodes and vnodes point to each other and should + * be updated simultaneously) + * v_op (in case someone needs to make a VOP call on this object) + */ +void +vn_reinit(vnode_t *vp) +{ + vp->v_count = 1; + vp->v_count_dnlc = 0; + vp->v_vfsp = NULL; + vp->v_stream = NULL; + vp->v_vfsmountedhere = NULL; + vp->v_flag = 0; + vp->v_type = VNON; + vp->v_rdev = NODEV; + + vp->v_filocks = NULL; + vp->v_shrlocks = NULL; + vp->v_pages = NULL; + + vp->v_locality = NULL; + vp->v_xattrdir = NULL; + + /* Handles v_femhead, v_path, and the r/w/map counts */ + vn_recycle(vp); +} + +vnode_t * +vn_alloc(int kmflag) +{ + vnode_t *vp; + + vp = kmem_cache_alloc(vn_cache, kmflag); + + if (vp != NULL) { + vp->v_femhead = NULL; /* Must be done before vn_reinit() */ + vp->v_fopdata = NULL; + vn_reinit(vp); + } + + return (vp); +} + +void +vn_free(vnode_t *vp) +{ + ASSERT(vp->v_shrlocks == NULL); + ASSERT(vp->v_filocks == NULL); + + /* + * Some file systems call vn_free() with v_count of zero, + * some with v_count of 1. In any case, the value should + * never be anything else. 
+ */ + ASSERT((vp->v_count == 0) || (vp->v_count == 1)); + ASSERT(vp->v_count_dnlc == 0); + if (vp->v_path != NULL) { + kmem_free(vp->v_path, strlen(vp->v_path) + 1); + vp->v_path = NULL; + } + + /* If FEM was in use, make sure everything gets cleaned up */ + if (vp->v_femhead) { + /* XXX - There should be a free_femhead() that does all this */ + ASSERT(vp->v_femhead->femh_list == NULL); + mutex_destroy(&vp->v_femhead->femh_lock); + kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); + vp->v_femhead = NULL; + } + + if (vp->v_fopdata != NULL) { + free_fopdata(vp); + } + vp->v_mpssdata = NULL; + vsd_free(vp); + kmem_cache_free(vn_cache, vp); +} + +/* + * vnode status changes, should define better states than 1, 0. + */ +void +vn_reclaim(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + + if (vfsp == NULL || + vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { + return; + } + (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED); +} + +void +vn_idle(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + + if (vfsp == NULL || + vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { + return; + } + (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED); +} +void +vn_exists(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + + if (vfsp == NULL || + vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { + return; + } + (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS); +} + +void +vn_invalid(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + + if (vfsp == NULL || + vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { + return; + } + (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED); +} + +/* Vnode event notification */ + +int +vnevent_support(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL) + return (EINVAL); + + return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct)); +} + +void +vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); +} + +void +vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, + caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct); +} + +void +vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); +} + +void +vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct); +} + +void +vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct); +} + +void +vnevent_create(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct); +} + +void +vnevent_link(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct); +} + +void +vnevent_mountedover(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct); +} + +/* + * Vnode accessors. 
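/*
 * Illustrative sketch (not part of the imported file): the vnevent_*
 * wrappers above are what file systems call to notify event monitors; each
 * wrapper is a no-op unless a monitor has been installed on the vnode
 * (v_femhead != NULL).  A remove path would typically fire the event once
 * the operation is known to succeed.  The function name is hypothetical.
 */
static void
example_notify_remove(vnode_t *vp, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
        /* harmless when no monitor is watching vp */
        vnevent_remove(vp, dvp, name, ct);
}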
+ */ + +int +vn_is_readonly(vnode_t *vp) +{ + return (vp->v_vfsp->vfs_flag & VFS_RDONLY); +} + +int +vn_has_flocks(vnode_t *vp) +{ + return (vp->v_filocks != NULL); +} + +int +vn_has_mandatory_locks(vnode_t *vp, int mode) +{ + return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode))); +} + +int +vn_has_cached_data(vnode_t *vp) +{ + return (vp->v_pages != NULL); +} + +/* + * Return 0 if the vnode in question shouldn't be permitted into a zone via + * zone_enter(2). + */ +int +vn_can_change_zones(vnode_t *vp) +{ + struct vfssw *vswp; + int allow = 1; + vnode_t *rvp; + + if (nfs_global_client_only != 0) + return (1); + + /* + * We always want to look at the underlying vnode if there is one. + */ + if (VOP_REALVP(vp, &rvp, NULL) != 0) + rvp = vp; + /* + * Some pseudo filesystems (including doorfs) don't actually register + * their vfsops_t, so the following may return NULL; we happily let + * such vnodes switch zones. + */ + vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp)); + if (vswp != NULL) { + if (vswp->vsw_flag & VSW_NOTZONESAFE) + allow = 0; + vfs_unrefvfssw(vswp); + } + return (allow); +} + +/* + * Return nonzero if the vnode is a mount point, zero if not. + */ +int +vn_ismntpt(vnode_t *vp) +{ + return (vp->v_vfsmountedhere != NULL); +} + +/* Retrieve the vfs (if any) mounted on this vnode */ +vfs_t * +vn_mountedvfs(vnode_t *vp) +{ + return (vp->v_vfsmountedhere); +} + +/* + * Return nonzero if the vnode is referenced by the dnlc, zero if not. + */ +int +vn_in_dnlc(vnode_t *vp) +{ + return (vp->v_count_dnlc > 0); +} + +/* + * vn_has_other_opens() checks whether a particular file is opened by more than + * just the caller and whether the open is for read and/or write. + * This routine is for calling after the caller has already called VOP_OPEN() + * and the caller wishes to know if they are the only one with it open for + * the mode(s) specified. + * + * Vnode counts are only kept on regular files (v_type=VREG). + */ +int +vn_has_other_opens( + vnode_t *vp, + v_mode_t mode) +{ + + ASSERT(vp != NULL); + + switch (mode) { + case V_WRITE: + if (vp->v_wrcnt > 1) + return (V_TRUE); + break; + case V_RDORWR: + if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1)) + return (V_TRUE); + break; + case V_RDANDWR: + if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1)) + return (V_TRUE); + break; + case V_READ: + if (vp->v_rdcnt > 1) + return (V_TRUE); + break; + } + + return (V_FALSE); +} + +/* + * vn_is_opened() checks whether a particular file is opened and + * whether the open is for read and/or write. + * + * Vnode counts are only kept on regular files (v_type=VREG). + */ +int +vn_is_opened( + vnode_t *vp, + v_mode_t mode) +{ + + ASSERT(vp != NULL); + + switch (mode) { + case V_WRITE: + if (vp->v_wrcnt) + return (V_TRUE); + break; + case V_RDANDWR: + if (vp->v_rdcnt && vp->v_wrcnt) + return (V_TRUE); + break; + case V_RDORWR: + if (vp->v_rdcnt || vp->v_wrcnt) + return (V_TRUE); + break; + case V_READ: + if (vp->v_rdcnt) + return (V_TRUE); + break; + } + + return (V_FALSE); +} + +/* + * vn_is_mapped() checks whether a particular file is mapped and whether + * the file is mapped read and/or write. + */ +int +vn_is_mapped( + vnode_t *vp, + v_mode_t mode) +{ + + ASSERT(vp != NULL); + +#if !defined(_LP64) + switch (mode) { + /* + * The atomic_add_64_nv functions force atomicity in the + * case of 32 bit architectures. Otherwise the 64 bit values + * require two fetches. 
The value of the fields may be + * (potentially) changed between the first fetch and the + * second + */ + case V_WRITE: + if (atomic_add_64_nv((&(vp->v_mmap_write)), 0)) + return (V_TRUE); + break; + case V_RDANDWR: + if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) && + (atomic_add_64_nv((&(vp->v_mmap_write)), 0))) + return (V_TRUE); + break; + case V_RDORWR: + if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) || + (atomic_add_64_nv((&(vp->v_mmap_write)), 0))) + return (V_TRUE); + break; + case V_READ: + if (atomic_add_64_nv((&(vp->v_mmap_read)), 0)) + return (V_TRUE); + break; + } +#else + switch (mode) { + case V_WRITE: + if (vp->v_mmap_write) + return (V_TRUE); + break; + case V_RDANDWR: + if (vp->v_mmap_read && vp->v_mmap_write) + return (V_TRUE); + break; + case V_RDORWR: + if (vp->v_mmap_read || vp->v_mmap_write) + return (V_TRUE); + break; + case V_READ: + if (vp->v_mmap_read) + return (V_TRUE); + break; + } +#endif + + return (V_FALSE); +} + +/* + * Set the operations vector for a vnode. + * + * FEM ensures that the v_femhead pointer is filled in before the + * v_op pointer is changed. This means that if the v_femhead pointer + * is NULL, and the v_op field hasn't changed since before which checked + * the v_femhead pointer; then our update is ok - we are not racing with + * FEM. + */ +void +vn_setops(vnode_t *vp, vnodeops_t *vnodeops) +{ + vnodeops_t *op; + + ASSERT(vp != NULL); + ASSERT(vnodeops != NULL); + + op = vp->v_op; + membar_consumer(); + /* + * If vp->v_femhead == NULL, then we'll call casptr() to do the + * compare-and-swap on vp->v_op. If either fails, then FEM is + * in effect on the vnode and we need to have FEM deal with it. + */ + if (vp->v_femhead != NULL || casptr(&vp->v_op, op, vnodeops) != op) { + fem_setvnops(vp, vnodeops); + } +} + +/* + * Retrieve the operations vector for a vnode + * As with vn_setops(above); make sure we aren't racing with FEM. + * FEM sets the v_op to a special, internal, vnodeops that wouldn't + * make sense to the callers of this routine. + */ +vnodeops_t * +vn_getops(vnode_t *vp) +{ + vnodeops_t *op; + + ASSERT(vp != NULL); + + op = vp->v_op; + membar_consumer(); + if (vp->v_femhead == NULL && op == vp->v_op) { + return (op); + } else { + return (fem_getvnops(vp)); + } +} + +/* + * Returns non-zero (1) if the vnodeops matches that of the vnode. + * Returns zero (0) if not. + */ +int +vn_matchops(vnode_t *vp, vnodeops_t *vnodeops) +{ + return (vn_getops(vp) == vnodeops); +} + +/* + * Returns non-zero (1) if the specified operation matches the + * corresponding operation for that the vnode. + * Returns zero (0) if not. + */ + +#define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0)) + +int +vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp) +{ + const fs_operation_trans_def_t *otdp; + fs_generic_func_p *loc = NULL; + vnodeops_t *vop = vn_getops(vp); + + ASSERT(vopname != NULL); + + for (otdp = vn_ops_table; otdp->name != NULL; otdp++) { + if (MATCHNAME(otdp->name, vopname)) { + loc = (fs_generic_func_p *) + ((char *)(vop) + otdp->offset); + break; + } + } + + return ((loc != NULL) && (*loc == funcp)); +} + +/* + * fs_new_caller_id() needs to return a unique ID on a given local system. + * The IDs do not need to survive across reboots. These are primarily + * used so that (FEM) monitors can detect particular callers (such as + * the NFS server) to a given vnode/vfs operation. 
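/*
 * Illustrative sketch (not part of the imported file): a file system
 * typically builds its vnodeops_t once from an fs_operation_def_t template
 * via vn_make_ops() (defined earlier in this file) and installs it on each
 * vnode with vn_setops() above, which cooperates with FEM if monitors are
 * present.  The template, module name, and variables below are
 * hypothetical.
 */
extern const fs_operation_def_t examplefs_vnodeops_template[];
static vnodeops_t *examplefs_vnodeops;

static int
examplefs_init_vnodeops(void)
{
        /* build the operations vector once, e.g. at module load */
        return (vn_make_ops("examplefs", examplefs_vnodeops_template,
            &examplefs_vnodeops));
}

static void
examplefs_init_vnode(vnode_t *vp)
{
        vn_setops(vp, examplefs_vnodeops);
}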
+ */ +u_longlong_t +fs_new_caller_id() +{ + static uint64_t next_caller_id = 0LL; /* First call returns 1 */ + + return ((u_longlong_t)atomic_add_64_nv(&next_caller_id, 1)); +} + +/* + * Given a starting vnode and a path, updates the path in the target vnode in + * a safe manner. If the vnode already has path information embedded, then the + * cached path is left untouched. + */ + +size_t max_vnode_path = 4 * MAXPATHLEN; + +void +vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, + const char *path, size_t plen) +{ + char *rpath; + vnode_t *base; + size_t rpathlen, rpathalloc; + int doslash = 1; + + if (*path == '/') { + base = rootvp; + path++; + plen--; + } else { + base = startvp; + } + + /* + * We cannot grab base->v_lock while we hold vp->v_lock because of + * the potential for deadlock. + */ + mutex_enter(&base->v_lock); + if (base->v_path == NULL) { + mutex_exit(&base->v_lock); + return; + } + + rpathlen = strlen(base->v_path); + rpathalloc = rpathlen + plen + 1; + /* Avoid adding a slash if there's already one there */ + if (base->v_path[rpathlen-1] == '/') + doslash = 0; + else + rpathalloc++; + + /* + * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held, + * so we must do this dance. If, by chance, something changes the path, + * just give up since there is no real harm. + */ + mutex_exit(&base->v_lock); + + /* Paths should stay within reason */ + if (rpathalloc > max_vnode_path) + return; + + rpath = kmem_alloc(rpathalloc, KM_SLEEP); + + mutex_enter(&base->v_lock); + if (base->v_path == NULL || strlen(base->v_path) != rpathlen) { + mutex_exit(&base->v_lock); + kmem_free(rpath, rpathalloc); + return; + } + bcopy(base->v_path, rpath, rpathlen); + mutex_exit(&base->v_lock); + + if (doslash) + rpath[rpathlen++] = '/'; + bcopy(path, rpath + rpathlen, plen); + rpath[rpathlen + plen] = '\0'; + + mutex_enter(&vp->v_lock); + if (vp->v_path != NULL) { + mutex_exit(&vp->v_lock); + kmem_free(rpath, rpathalloc); + } else { + vp->v_path = rpath; + mutex_exit(&vp->v_lock); + } +} + +/* + * Sets the path to the vnode to be the given string, regardless of current + * context. The string must be a complete path from rootdir. This is only used + * by fsop_root() for setting the path based on the mountpoint. + */ +void +vn_setpath_str(struct vnode *vp, const char *str, size_t len) +{ + char *buf = kmem_alloc(len + 1, KM_SLEEP); + + mutex_enter(&vp->v_lock); + if (vp->v_path != NULL) { + mutex_exit(&vp->v_lock); + kmem_free(buf, len + 1); + return; + } + + vp->v_path = buf; + bcopy(str, vp->v_path, len); + vp->v_path[len] = '\0'; + + mutex_exit(&vp->v_lock); +} + +/* + * Called from within filesystem's vop_rename() to handle renames once the + * target vnode is available. + */ +void +vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len) +{ + char *tmp; + + mutex_enter(&vp->v_lock); + tmp = vp->v_path; + vp->v_path = NULL; + mutex_exit(&vp->v_lock); + vn_setpath(rootdir, dvp, vp, nm, len); + if (tmp != NULL) + kmem_free(tmp, strlen(tmp) + 1); +} + +/* + * Similar to vn_setpath_str(), this function sets the path of the destination + * vnode to the be the same as the source vnode. 
+ */ +void +vn_copypath(struct vnode *src, struct vnode *dst) +{ + char *buf; + int alloc; + + mutex_enter(&src->v_lock); + if (src->v_path == NULL) { + mutex_exit(&src->v_lock); + return; + } + alloc = strlen(src->v_path) + 1; + + /* avoid kmem_alloc() with lock held */ + mutex_exit(&src->v_lock); + buf = kmem_alloc(alloc, KM_SLEEP); + mutex_enter(&src->v_lock); + if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) { + mutex_exit(&src->v_lock); + kmem_free(buf, alloc); + return; + } + bcopy(src->v_path, buf, alloc); + mutex_exit(&src->v_lock); + + mutex_enter(&dst->v_lock); + if (dst->v_path != NULL) { + mutex_exit(&dst->v_lock); + kmem_free(buf, alloc); + return; + } + dst->v_path = buf; + mutex_exit(&dst->v_lock); +} + +/* + * XXX Private interface for segvn routines that handle vnode + * large page segments. + * + * return 1 if vp's file system VOP_PAGEIO() implementation + * can be safely used instead of VOP_GETPAGE() for handling + * pagefaults against regular non swap files. VOP_PAGEIO() + * interface is considered safe here if its implementation + * is very close to VOP_GETPAGE() implementation. + * e.g. It zero's out the part of the page beyond EOF. Doesn't + * panic if there're file holes but instead returns an error. + * Doesn't assume file won't be changed by user writes, etc. + * + * return 0 otherwise. + * + * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs. + */ +int +vn_vmpss_usepageio(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + char *fsname = vfssw[vfsp->vfs_fstype].vsw_name; + char *pageio_ok_fss[] = {"ufs", "nfs", NULL}; + char **fsok = pageio_ok_fss; + + if (fsname == NULL) { + return (0); + } + + for (; *fsok; fsok++) { + if (strcmp(*fsok, fsname) == 0) { + return (1); + } + } + return (0); +} + +/* VOP_XXX() macros call the corresponding fop_xxx() function */ + +int +fop_open( + vnode_t **vpp, + int mode, + cred_t *cr, + caller_context_t *ct) +{ + int ret; + vnode_t *vp = *vpp; + + VN_HOLD(vp); + /* + * Adding to the vnode counts before calling open + * avoids the need for a mutex. It circumvents a race + * condition where a query made on the vnode counts results in a + * false negative. The inquirer goes away believing the file is + * not open when there is an open on the file already under way. + * + * The counts are meant to prevent NFS from granting a delegation + * when it would be dangerous to do so. + * + * The vnode counts are only kept on regular files + */ + if ((*vpp)->v_type == VREG) { + if (mode & FREAD) + atomic_add_32(&((*vpp)->v_rdcnt), 1); + if (mode & FWRITE) + atomic_add_32(&((*vpp)->v_wrcnt), 1); + } + + VOPXID_MAP_CR(vp, cr); + + ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct); + + if (ret) { + /* + * Use the saved vp just in case the vnode ptr got trashed + * by the error. + */ + VOPSTATS_UPDATE(vp, open); + if ((vp->v_type == VREG) && (mode & FREAD)) + atomic_add_32(&(vp->v_rdcnt), -1); + if ((vp->v_type == VREG) && (mode & FWRITE)) + atomic_add_32(&(vp->v_wrcnt), -1); + } else { + /* + * Some filesystems will return a different vnode, + * but the same path was still used to open it. + * So if we do change the vnode and need to + * copy over the path, do so here, rather than special + * casing each filesystem. Adjust the vnode counts to + * reflect the vnode switch. 
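/*
 * Illustrative sketch (not part of the imported file): the per-vnode read
 * and write counts maintained by fop_open() and fop_close() are what make
 * queries such as vn_has_other_opens() reliable; because fop_open() bumps
 * the counts before calling VOP_OPEN(), a concurrent opener cannot be
 * missed.  A delegation-style check might look like this (function name is
 * hypothetical):
 */
static boolean_t
example_write_delegation_ok(vnode_t *vp)
{
        if (vp->v_type != VREG)
                return (B_FALSE);       /* counts are only kept for VREG */
        return (vn_has_other_opens(vp, V_RDORWR) ? B_FALSE : B_TRUE);
}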
+ */ + VOPSTATS_UPDATE(*vpp, open); + if (*vpp != vp && *vpp != NULL) { + vn_copypath(vp, *vpp); + if (((*vpp)->v_type == VREG) && (mode & FREAD)) + atomic_add_32(&((*vpp)->v_rdcnt), 1); + if ((vp->v_type == VREG) && (mode & FREAD)) + atomic_add_32(&(vp->v_rdcnt), -1); + if (((*vpp)->v_type == VREG) && (mode & FWRITE)) + atomic_add_32(&((*vpp)->v_wrcnt), 1); + if ((vp->v_type == VREG) && (mode & FWRITE)) + atomic_add_32(&(vp->v_wrcnt), -1); + } + } + VN_RELE(vp); + return (ret); +} + +int +fop_close( + vnode_t *vp, + int flag, + int count, + offset_t offset, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct); + VOPSTATS_UPDATE(vp, close); + /* + * Check passed in count to handle possible dups. Vnode counts are only + * kept on regular files + */ + if ((vp->v_type == VREG) && (count == 1)) { + if (flag & FREAD) { + ASSERT(vp->v_rdcnt > 0); + atomic_add_32(&(vp->v_rdcnt), -1); + } + if (flag & FWRITE) { + ASSERT(vp->v_wrcnt > 0); + atomic_add_32(&(vp->v_wrcnt), -1); + } + } + return (err); +} + +int +fop_read( + vnode_t *vp, + uio_t *uiop, + int ioflag, + cred_t *cr, + caller_context_t *ct) +{ + int err; + ssize_t resid_start = uiop->uio_resid; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); + VOPSTATS_UPDATE_IO(vp, read, + read_bytes, (resid_start - uiop->uio_resid)); + return (err); +} + +int +fop_write( + vnode_t *vp, + uio_t *uiop, + int ioflag, + cred_t *cr, + caller_context_t *ct) +{ + int err; + ssize_t resid_start = uiop->uio_resid; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); + VOPSTATS_UPDATE_IO(vp, write, + write_bytes, (resid_start - uiop->uio_resid)); + return (err); +} + +int +fop_ioctl( + vnode_t *vp, + int cmd, + intptr_t arg, + int flag, + cred_t *cr, + int *rvalp, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct); + VOPSTATS_UPDATE(vp, ioctl); + return (err); +} + +int +fop_setfl( + vnode_t *vp, + int oflags, + int nflags, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct); + VOPSTATS_UPDATE(vp, setfl); + return (err); +} + +int +fop_getattr( + vnode_t *vp, + vattr_t *vap, + int flags, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + /* + * If this file system doesn't understand the xvattr extensions + * then turn off the xvattr bit. + */ + if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) { + vap->va_mask &= ~AT_XVATTR; + } + + /* + * We're only allowed to skip the ACL check iff we used a 32 bit + * ACE mask with VOP_ACCESS() to determine permissions. + */ + if ((flags & ATTR_NOACLCHECK) && + vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { + return (EINVAL); + } + err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct); + VOPSTATS_UPDATE(vp, getattr); + return (err); +} + +int +fop_setattr( + vnode_t *vp, + vattr_t *vap, + int flags, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + /* + * If this file system doesn't understand the xvattr extensions + * then turn off the xvattr bit. + */ + if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) { + vap->va_mask &= ~AT_XVATTR; + } + + /* + * We're only allowed to skip the ACL check iff we used a 32 bit + * ACE mask with VOP_ACCESS() to determine permissions. 
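/*
 * Illustrative sketch (not part of the imported file): callers that want
 * extended attributes pass an xvattr_t and request the optional attribute
 * explicitly.  fop_getattr() above silently clears AT_XVATTR when the file
 * system lacks VFSFT_XVATTR, so the returned mask must be checked; this
 * mirrors vn_is_reparse() later in this file.
 */
static boolean_t
example_get_reparse_bit(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
        xvattr_t xvattr;
        xoptattr_t *xoap;

        xva_init(&xvattr);
        xoap = xva_getxoptattr(&xvattr);
        ASSERT(xoap);
        XVA_SET_REQ(&xvattr, XAT_REPARSE);

        if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct) != 0)
                return (B_FALSE);
        if (!(xvattr.xva_vattr.va_mask & AT_XVATTR) ||
            !XVA_ISSET_RTN(&xvattr, XAT_REPARSE))
                return (B_FALSE);       /* the request was not honored */
        return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
}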
+ */ + if ((flags & ATTR_NOACLCHECK) && + vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { + return (EINVAL); + } + err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct); + VOPSTATS_UPDATE(vp, setattr); + return (err); +} + +int +fop_access( + vnode_t *vp, + int mode, + int flags, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + if ((flags & V_ACE_MASK) && + vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { + return (EINVAL); + } + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct); + VOPSTATS_UPDATE(vp, access); + return (err); +} + +int +fop_lookup( + vnode_t *dvp, + char *nm, + vnode_t **vpp, + pathname_t *pnp, + int flags, + vnode_t *rdir, + cred_t *cr, + caller_context_t *ct, + int *deflags, /* Returned per-dirent flags */ + pathname_t *ppnp) /* Returned case-preserved name in directory */ +{ + int ret; + + /* + * If this file system doesn't support case-insensitive access + * and said access is requested, fail quickly. It is required + * that if the vfs supports case-insensitive lookup, it also + * supports extended dirent flags. + */ + if (flags & FIGNORECASE && + (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) + return (EINVAL); + + VOPXID_MAP_CR(dvp, cr); + + if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) { + ret = xattr_dir_lookup(dvp, vpp, flags, cr); + } else { + ret = (*(dvp)->v_op->vop_lookup) + (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp); + } + if (ret == 0 && *vpp) { + VOPSTATS_UPDATE(*vpp, lookup); + if ((*vpp)->v_path == NULL) { + vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm)); + } + } + + return (ret); +} + +int +fop_create( + vnode_t *dvp, + char *name, + vattr_t *vap, + vcexcl_t excl, + int mode, + vnode_t **vpp, + cred_t *cr, + int flags, + caller_context_t *ct, + vsecattr_t *vsecp) /* ACL to set during create */ +{ + int ret; + + if (vsecp != NULL && + vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) { + return (EINVAL); + } + /* + * If this file system doesn't support case-insensitive access + * and said access is requested, fail quickly. + */ + if (flags & FIGNORECASE && + (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) + return (EINVAL); + + VOPXID_MAP_CR(dvp, cr); + + ret = (*(dvp)->v_op->vop_create) + (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); + if (ret == 0 && *vpp) { + VOPSTATS_UPDATE(*vpp, create); + if ((*vpp)->v_path == NULL) { + vn_setpath(rootdir, dvp, *vpp, name, strlen(name)); + } + } + + return (ret); +} + +int +fop_remove( + vnode_t *dvp, + char *nm, + cred_t *cr, + caller_context_t *ct, + int flags) +{ + int err; + + /* + * If this file system doesn't support case-insensitive access + * and said access is requested, fail quickly. + */ + if (flags & FIGNORECASE && + (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) + return (EINVAL); + + VOPXID_MAP_CR(dvp, cr); + + err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags); + VOPSTATS_UPDATE(dvp, remove); + return (err); +} + +int +fop_link( + vnode_t *tdvp, + vnode_t *svp, + char *tnm, + cred_t *cr, + caller_context_t *ct, + int flags) +{ + int err; + + /* + * If the target file system doesn't support case-insensitive access + * and said access is requested, fail quickly. 
+ */ + if (flags & FIGNORECASE && + (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) + return (EINVAL); + + VOPXID_MAP_CR(tdvp, cr); + + err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags); + VOPSTATS_UPDATE(tdvp, link); + return (err); +} + +int +fop_rename( + vnode_t *sdvp, + char *snm, + vnode_t *tdvp, + char *tnm, + cred_t *cr, + caller_context_t *ct, + int flags) +{ + int err; + + /* + * If the file system involved does not support + * case-insensitive access and said access is requested, fail + * quickly. + */ + if (flags & FIGNORECASE && + ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))) + return (EINVAL); + + VOPXID_MAP_CR(tdvp, cr); + + err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags); + VOPSTATS_UPDATE(sdvp, rename); + return (err); +} + +int +fop_mkdir( + vnode_t *dvp, + char *dirname, + vattr_t *vap, + vnode_t **vpp, + cred_t *cr, + caller_context_t *ct, + int flags, + vsecattr_t *vsecp) /* ACL to set during create */ +{ + int ret; + + if (vsecp != NULL && + vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) { + return (EINVAL); + } + /* + * If this file system doesn't support case-insensitive access + * and said access is requested, fail quickly. + */ + if (flags & FIGNORECASE && + (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) + return (EINVAL); + + VOPXID_MAP_CR(dvp, cr); + + ret = (*(dvp)->v_op->vop_mkdir) + (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); + if (ret == 0 && *vpp) { + VOPSTATS_UPDATE(*vpp, mkdir); + if ((*vpp)->v_path == NULL) { + vn_setpath(rootdir, dvp, *vpp, dirname, + strlen(dirname)); + } + } + + return (ret); +} + +int +fop_rmdir( + vnode_t *dvp, + char *nm, + vnode_t *cdir, + cred_t *cr, + caller_context_t *ct, + int flags) +{ + int err; + + /* + * If this file system doesn't support case-insensitive access + * and said access is requested, fail quickly. + */ + if (flags & FIGNORECASE && + (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) + return (EINVAL); + + VOPXID_MAP_CR(dvp, cr); + + err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags); + VOPSTATS_UPDATE(dvp, rmdir); + return (err); +} + +int +fop_readdir( + vnode_t *vp, + uio_t *uiop, + cred_t *cr, + int *eofp, + caller_context_t *ct, + int flags) +{ + int err; + ssize_t resid_start = uiop->uio_resid; + + /* + * If this file system doesn't support retrieving directory + * entry flags and said access is requested, fail quickly. + */ + if (flags & V_RDDIR_ENTFLAGS && + vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0) + return (EINVAL); + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags); + VOPSTATS_UPDATE_IO(vp, readdir, + readdir_bytes, (resid_start - uiop->uio_resid)); + return (err); +} + +int +fop_symlink( + vnode_t *dvp, + char *linkname, + vattr_t *vap, + char *target, + cred_t *cr, + caller_context_t *ct, + int flags) +{ + int err; + xvattr_t xvattr; + + /* + * If this file system doesn't support case-insensitive access + * and said access is requested, fail quickly. 
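/*
 * Illustrative sketch (not part of the imported file): the case-
 * insensitivity test repeated by the directory operations above can be
 * read as one predicate.  FIGNORECASE is honored only when the file system
 * advertises case-insensitive or mixed-case support; otherwise the wrapper
 * fails with EINVAL.  The helper name is hypothetical.
 */
static int
example_ignorecase_ok(vnode_t *dvp, int flags)
{
        if (!(flags & FIGNORECASE))
                return (1);
        return (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) != 0 ||
            vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) != 0);
}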
+ */ + if (flags & FIGNORECASE && + (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && + vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) + return (EINVAL); + + VOPXID_MAP_CR(dvp, cr); + + /* check for reparse point */ + if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) && + (strncmp(target, FS_REPARSE_TAG_STR, + strlen(FS_REPARSE_TAG_STR)) == 0)) { + if (!fs_reparse_mark(target, vap, &xvattr)) + vap = (vattr_t *)&xvattr; + } + + err = (*(dvp)->v_op->vop_symlink) + (dvp, linkname, vap, target, cr, ct, flags); + VOPSTATS_UPDATE(dvp, symlink); + return (err); +} + +int +fop_readlink( + vnode_t *vp, + uio_t *uiop, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct); + VOPSTATS_UPDATE(vp, readlink); + return (err); +} + +int +fop_fsync( + vnode_t *vp, + int syncflag, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct); + VOPSTATS_UPDATE(vp, fsync); + return (err); +} + +void +fop_inactive( + vnode_t *vp, + cred_t *cr, + caller_context_t *ct) +{ + /* Need to update stats before vop call since we may lose the vnode */ + VOPSTATS_UPDATE(vp, inactive); + + VOPXID_MAP_CR(vp, cr); + + (*(vp)->v_op->vop_inactive)(vp, cr, ct); +} + +int +fop_fid( + vnode_t *vp, + fid_t *fidp, + caller_context_t *ct) +{ + int err; + + err = (*(vp)->v_op->vop_fid)(vp, fidp, ct); + VOPSTATS_UPDATE(vp, fid); + return (err); +} + +int +fop_rwlock( + vnode_t *vp, + int write_lock, + caller_context_t *ct) +{ + int ret; + + ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct)); + VOPSTATS_UPDATE(vp, rwlock); + return (ret); +} + +void +fop_rwunlock( + vnode_t *vp, + int write_lock, + caller_context_t *ct) +{ + (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct); + VOPSTATS_UPDATE(vp, rwunlock); +} + +int +fop_seek( + vnode_t *vp, + offset_t ooff, + offset_t *noffp, + caller_context_t *ct) +{ + int err; + + err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct); + VOPSTATS_UPDATE(vp, seek); + return (err); +} + +int +fop_cmp( + vnode_t *vp1, + vnode_t *vp2, + caller_context_t *ct) +{ + int err; + + err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct); + VOPSTATS_UPDATE(vp1, cmp); + return (err); +} + +int +fop_frlock( + vnode_t *vp, + int cmd, + flock64_t *bfp, + int flag, + offset_t offset, + struct flk_callback *flk_cbp, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_frlock) + (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); + VOPSTATS_UPDATE(vp, frlock); + return (err); +} + +int +fop_space( + vnode_t *vp, + int cmd, + flock64_t *bfp, + int flag, + offset_t offset, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct); + VOPSTATS_UPDATE(vp, space); + return (err); +} + +int +fop_realvp( + vnode_t *vp, + vnode_t **vpp, + caller_context_t *ct) +{ + int err; + + err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct); + VOPSTATS_UPDATE(vp, realvp); + return (err); +} + +int +fop_getpage( + vnode_t *vp, + offset_t off, + size_t len, + uint_t *protp, + page_t **plarr, + size_t plsz, + struct seg *seg, + caddr_t addr, + enum seg_rw rw, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_getpage) + (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct); + VOPSTATS_UPDATE(vp, getpage); + return (err); +} + +int +fop_putpage( + vnode_t *vp, + offset_t off, + size_t 
len, + int flags, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct); + VOPSTATS_UPDATE(vp, putpage); + return (err); +} + +int +fop_map( + vnode_t *vp, + offset_t off, + struct as *as, + caddr_t *addrp, + size_t len, + uchar_t prot, + uchar_t maxprot, + uint_t flags, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_map) + (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct); + VOPSTATS_UPDATE(vp, map); + return (err); +} + +int +fop_addmap( + vnode_t *vp, + offset_t off, + struct as *as, + caddr_t addr, + size_t len, + uchar_t prot, + uchar_t maxprot, + uint_t flags, + cred_t *cr, + caller_context_t *ct) +{ + int error; + u_longlong_t delta; + + VOPXID_MAP_CR(vp, cr); + + error = (*(vp)->v_op->vop_addmap) + (vp, off, as, addr, len, prot, maxprot, flags, cr, ct); + + if ((!error) && (vp->v_type == VREG)) { + delta = (u_longlong_t)btopr(len); + /* + * If file is declared MAP_PRIVATE, it can't be written back + * even if open for write. Handle as read. + */ + if (flags & MAP_PRIVATE) { + atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), + (int64_t)delta); + } else { + /* + * atomic_add_64 forces the fetch of a 64 bit value to + * be atomic on 32 bit machines + */ + if (maxprot & PROT_WRITE) + atomic_add_64((uint64_t *)(&(vp->v_mmap_write)), + (int64_t)delta); + if (maxprot & PROT_READ) + atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), + (int64_t)delta); + if (maxprot & PROT_EXEC) + atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), + (int64_t)delta); + } + } + VOPSTATS_UPDATE(vp, addmap); + return (error); +} + +int +fop_delmap( + vnode_t *vp, + offset_t off, + struct as *as, + caddr_t addr, + size_t len, + uint_t prot, + uint_t maxprot, + uint_t flags, + cred_t *cr, + caller_context_t *ct) +{ + int error; + u_longlong_t delta; + + VOPXID_MAP_CR(vp, cr); + + error = (*(vp)->v_op->vop_delmap) + (vp, off, as, addr, len, prot, maxprot, flags, cr, ct); + + /* + * NFS calls into delmap twice, the first time + * it simply establishes a callback mechanism and returns EAGAIN + * while the real work is being done upon the second invocation. + * We have to detect this here and only decrement the counts upon + * the second delmap request. 
+ */ + if ((error != EAGAIN) && (vp->v_type == VREG)) { + + delta = (u_longlong_t)btopr(len); + + if (flags & MAP_PRIVATE) { + atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), + (int64_t)(-delta)); + } else { + /* + * atomic_add_64 forces the fetch of a 64 bit value + * to be atomic on 32 bit machines + */ + if (maxprot & PROT_WRITE) + atomic_add_64((uint64_t *)(&(vp->v_mmap_write)), + (int64_t)(-delta)); + if (maxprot & PROT_READ) + atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), + (int64_t)(-delta)); + if (maxprot & PROT_EXEC) + atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), + (int64_t)(-delta)); + } + } + VOPSTATS_UPDATE(vp, delmap); + return (error); +} + + +int +fop_poll( + vnode_t *vp, + short events, + int anyyet, + short *reventsp, + struct pollhead **phpp, + caller_context_t *ct) +{ + int err; + + err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct); + VOPSTATS_UPDATE(vp, poll); + return (err); +} + +int +fop_dump( + vnode_t *vp, + caddr_t addr, + offset_t lbdn, + offset_t dblks, + caller_context_t *ct) +{ + int err; + + /* ensure lbdn and dblks can be passed safely to bdev_dump */ + if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks)) + return (EIO); + + err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct); + VOPSTATS_UPDATE(vp, dump); + return (err); +} + +int +fop_pathconf( + vnode_t *vp, + int cmd, + ulong_t *valp, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct); + VOPSTATS_UPDATE(vp, pathconf); + return (err); +} + +int +fop_pageio( + vnode_t *vp, + struct page *pp, + u_offset_t io_off, + size_t io_len, + int flags, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct); + VOPSTATS_UPDATE(vp, pageio); + return (err); +} + +int +fop_dumpctl( + vnode_t *vp, + int action, + offset_t *blkp, + caller_context_t *ct) +{ + int err; + err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct); + VOPSTATS_UPDATE(vp, dumpctl); + return (err); +} + +void +fop_dispose( + vnode_t *vp, + page_t *pp, + int flag, + int dn, + cred_t *cr, + caller_context_t *ct) +{ + /* Must do stats first since it's possible to lose the vnode */ + VOPSTATS_UPDATE(vp, dispose); + + VOPXID_MAP_CR(vp, cr); + + (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct); +} + +int +fop_setsecattr( + vnode_t *vp, + vsecattr_t *vsap, + int flag, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + /* + * We're only allowed to skip the ACL check iff we used a 32 bit + * ACE mask with VOP_ACCESS() to determine permissions. + */ + if ((flag & ATTR_NOACLCHECK) && + vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { + return (EINVAL); + } + err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct); + VOPSTATS_UPDATE(vp, setsecattr); + return (err); +} + +int +fop_getsecattr( + vnode_t *vp, + vsecattr_t *vsap, + int flag, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + /* + * We're only allowed to skip the ACL check iff we used a 32 bit + * ACE mask with VOP_ACCESS() to determine permissions. 
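/*
 * Illustrative sketch (not part of the imported file): ATTR_NOACLCHECK may
 * only be used when permissions were already determined with a 32-bit ACE
 * mask, which is what the attribute wrappers enforce through
 * VFSFT_ACEMASKONACCESS.  The ACE_WRITE_ATTRIBUTES constant and the
 * function name below are assumptions of this example.
 */
static int
example_setattr_noaclcheck(vnode_t *vp, vattr_t *vap, cred_t *cr,
    caller_context_t *ct)
{
        int error;

        /* permission check using a 32-bit ACE mask ... */
        error = VOP_ACCESS(vp, ACE_WRITE_ATTRIBUTES, V_ACE_MASK, cr, ct);
        if (error != 0)
                return (error);
        /* ... allows the redundant ACL check to be skipped here */
        return (VOP_SETATTR(vp, vap, ATTR_NOACLCHECK, cr, ct));
}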
+ */ + if ((flag & ATTR_NOACLCHECK) && + vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { + return (EINVAL); + } + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct); + VOPSTATS_UPDATE(vp, getsecattr); + return (err); +} + +int +fop_shrlock( + vnode_t *vp, + int cmd, + struct shrlock *shr, + int flag, + cred_t *cr, + caller_context_t *ct) +{ + int err; + + VOPXID_MAP_CR(vp, cr); + + err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct); + VOPSTATS_UPDATE(vp, shrlock); + return (err); +} + +int +fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm, + caller_context_t *ct) +{ + int err; + + err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct); + VOPSTATS_UPDATE(vp, vnevent); + return (err); +} + +int +fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr, + caller_context_t *ct) +{ + int err; + + if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) + return (ENOTSUP); + err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct); + VOPSTATS_UPDATE(vp, reqzcbuf); + return (err); +} + +int +fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + int err; + + if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) + return (ENOTSUP); + err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct); + VOPSTATS_UPDATE(vp, retzcbuf); + return (err); +} + +/* + * Default destructor + * Needed because NULL destructor means that the key is unused + */ +/* ARGSUSED */ +void +vsd_defaultdestructor(void *value) +{} + +/* + * Create a key (index into per vnode array) + * Locks out vsd_create, vsd_destroy, and vsd_free + * May allocate memory with lock held + */ +void +vsd_create(uint_t *keyp, void (*destructor)(void *)) +{ + int i; + uint_t nkeys; + + /* + * if key is allocated, do nothing + */ + mutex_enter(&vsd_lock); + if (*keyp) { + mutex_exit(&vsd_lock); + return; + } + /* + * find an unused key + */ + if (destructor == NULL) + destructor = vsd_defaultdestructor; + + for (i = 0; i < vsd_nkeys; ++i) + if (vsd_destructor[i] == NULL) + break; + + /* + * if no unused keys, increase the size of the destructor array + */ + if (i == vsd_nkeys) { + if ((nkeys = (vsd_nkeys << 1)) == 0) + nkeys = 1; + vsd_destructor = + (void (**)(void *))vsd_realloc((void *)vsd_destructor, + (size_t)(vsd_nkeys * sizeof (void (*)(void *))), + (size_t)(nkeys * sizeof (void (*)(void *)))); + vsd_nkeys = nkeys; + } + + /* + * allocate the next available unused key + */ + vsd_destructor[i] = destructor; + *keyp = i + 1; + + /* create vsd_list, if it doesn't exist */ + if (vsd_list == NULL) { + vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(vsd_list, sizeof (struct vsd_node), + offsetof(struct vsd_node, vs_nodes)); + } + + mutex_exit(&vsd_lock); +} + +/* + * Destroy a key + * + * Assumes that the caller is preventing vsd_set and vsd_get + * Locks out vsd_create, vsd_destroy, and vsd_free + * May free memory with lock held + */ +void +vsd_destroy(uint_t *keyp) +{ + uint_t key; + struct vsd_node *vsd; + + /* + * protect the key namespace and our destructor lists + */ + mutex_enter(&vsd_lock); + key = *keyp; + *keyp = 0; + + ASSERT(key <= vsd_nkeys); + + /* + * if the key is valid + */ + if (key != 0) { + uint_t k = key - 1; + /* + * for every vnode with VSD, call key's destructor + */ + for (vsd = list_head(vsd_list); vsd != NULL; + vsd = list_next(vsd_list, vsd)) { + /* + * no VSD for key in this vnode + */ + if (key > vsd->vs_nkeys) + continue; + /* + * call destructor for key + */ + if 
(vsd->vs_value[k] && vsd_destructor[k]) + (*vsd_destructor[k])(vsd->vs_value[k]); + /* + * reset value for key + */ + vsd->vs_value[k] = NULL; + } + /* + * actually free the key (NULL destructor == unused) + */ + vsd_destructor[k] = NULL; + } + + mutex_exit(&vsd_lock); +} + +/* + * Quickly return the per vnode value that was stored with the specified key + * Assumes the caller is protecting key from vsd_create and vsd_destroy + * Assumes the caller is holding v_vsd_lock to protect the vsd. + */ +void * +vsd_get(vnode_t *vp, uint_t key) +{ + struct vsd_node *vsd; + + ASSERT(vp != NULL); + ASSERT(mutex_owned(&vp->v_vsd_lock)); + + vsd = vp->v_vsd; + + if (key && vsd != NULL && key <= vsd->vs_nkeys) + return (vsd->vs_value[key - 1]); + return (NULL); +} + +/* + * Set a per vnode value indexed with the specified key + * Assumes the caller is holding v_vsd_lock to protect the vsd. + */ +int +vsd_set(vnode_t *vp, uint_t key, void *value) +{ + struct vsd_node *vsd; + + ASSERT(vp != NULL); + ASSERT(mutex_owned(&vp->v_vsd_lock)); + + if (key == 0) + return (EINVAL); + + vsd = vp->v_vsd; + if (vsd == NULL) + vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP); + + /* + * If the vsd was just allocated, vs_nkeys will be 0, so the following + * code won't happen and we will continue down and allocate space for + * the vs_value array. + * If the caller is replacing one value with another, then it is up + * to the caller to free/rele/destroy the previous value (if needed). + */ + if (key <= vsd->vs_nkeys) { + vsd->vs_value[key - 1] = value; + return (0); + } + + ASSERT(key <= vsd_nkeys); + + if (vsd->vs_nkeys == 0) { + mutex_enter(&vsd_lock); /* lock out vsd_destroy() */ + /* + * Link onto list of all VSD nodes. + */ + list_insert_head(vsd_list, vsd); + mutex_exit(&vsd_lock); + } + + /* + * Allocate vnode local storage and set the value for key + */ + vsd->vs_value = vsd_realloc(vsd->vs_value, + vsd->vs_nkeys * sizeof (void *), + key * sizeof (void *)); + vsd->vs_nkeys = key; + vsd->vs_value[key - 1] = value; + + return (0); +} + +/* + * Called from vn_free() to run the destructor function for each vsd + * Locks out vsd_create and vsd_destroy + * Assumes that the destructor *DOES NOT* use vsd + */ +void +vsd_free(vnode_t *vp) +{ + int i; + struct vsd_node *vsd = vp->v_vsd; + + if (vsd == NULL) + return; + + if (vsd->vs_nkeys == 0) { + kmem_free(vsd, sizeof (*vsd)); + vp->v_vsd = NULL; + return; + } + + /* + * lock out vsd_create and vsd_destroy, call + * the destructor, and mark the value as destroyed. + */ + mutex_enter(&vsd_lock); + + for (i = 0; i < vsd->vs_nkeys; i++) { + if (vsd->vs_value[i] && vsd_destructor[i]) + (*vsd_destructor[i])(vsd->vs_value[i]); + vsd->vs_value[i] = NULL; + } + + /* + * remove from linked list of VSD nodes + */ + list_remove(vsd_list, vsd); + + mutex_exit(&vsd_lock); + + /* + * free up the VSD + */ + kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *)); + kmem_free(vsd, sizeof (struct vsd_node)); + vp->v_vsd = NULL; +} + +/* + * realloc + */ +static void * +vsd_realloc(void *old, size_t osize, size_t nsize) +{ + void *new; + + new = kmem_zalloc(nsize, KM_SLEEP); + if (old) { + bcopy(old, new, osize); + kmem_free(old, osize); + } + return (new); +} + +/* + * Setup the extensible system attribute for creating a reparse point. + * The symlink data 'target' is validated for proper format of a reparse + * string and a check also made to make sure the symlink data does not + * point to an existing file. + * + * return 0 if ok else -1. 
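/*
 * Illustrative sketch (not part of the imported file): typical use of the
 * vnode-specific data (VSD) routines above.  The key is created once (for
 * example at module load; vsd_create() is idempotent), and v_vsd_lock must
 * be held around vsd_set()/vsd_get() as their ASSERTs require.  The key
 * variable and payload below are hypothetical.
 */
static uint_t example_vsd_key;

static void
example_vsd_attach(vnode_t *vp, void *payload)
{
        vsd_create(&example_vsd_key, NULL);     /* default destructor */

        mutex_enter(&vp->v_vsd_lock);
        (void) vsd_set(vp, example_vsd_key, payload);
        mutex_exit(&vp->v_vsd_lock);
}

static void *
example_vsd_lookup(vnode_t *vp)
{
        void *payload;

        mutex_enter(&vp->v_vsd_lock);
        payload = vsd_get(vp, example_vsd_key);
        mutex_exit(&vp->v_vsd_lock);
        return (payload);
}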
+ */ +static int +fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr) +{ + xoptattr_t *xoap; + + if ((!target) || (!vap) || (!xvattr)) + return (-1); + + /* validate reparse string */ + if (reparse_validate((const char *)target)) + return (-1); + + xva_init(xvattr); + xvattr->xva_vattr = *vap; + xvattr->xva_vattr.va_mask |= AT_XVATTR; + xoap = xva_getxoptattr(xvattr); + ASSERT(xoap); + XVA_SET_REQ(xvattr, XAT_REPARSE); + xoap->xoa_reparse = 1; + + return (0); +} + +/* + * Function to check whether a symlink is a reparse point. + * Return B_TRUE if it is a reparse point, else return B_FALSE + */ +boolean_t +vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + xvattr_t xvattr; + xoptattr_t *xoap; + + if ((vp->v_type != VLNK) || + !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR))) + return (B_FALSE); + + xva_init(&xvattr); + xoap = xva_getxoptattr(&xvattr); + ASSERT(xoap); + XVA_SET_REQ(&xvattr, XAT_REPARSE); + + if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct)) + return (B_FALSE); + + if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) || + (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE)))) + return (B_FALSE); + + return (xoap->xoa_reparse ? B_TRUE : B_FALSE); +} diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c new file mode 100644 index 000000000000..a82718e8bc6e --- /dev/null +++ b/uts/common/fs/zfs/arc.c @@ -0,0 +1,4658 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * DVA-based Adjustable Replacement Cache + * + * While much of the theory of operation used here is + * based on the self-tuning, low overhead replacement cache + * presented by Megiddo and Modha at FAST 2003, there are some + * significant differences: + * + * 1. The Megiddo and Modha model assumes any page is evictable. + * Pages in its cache cannot be "locked" into memory. This makes + * the eviction algorithm simple: evict the last page in the list. + * This also make the performance characteristics easy to reason + * about. Our cache is not so simple. At any given moment, some + * subset of the blocks in the cache are un-evictable because we + * have handed out a reference to them. Blocks are only evictable + * when there are no external references active. This makes + * eviction far more problematic: we choose to evict the evictable + * blocks that are the "lowest" in the list. + * + * There are times when it is not possible to evict the requested + * space. In these circumstances we are unable to adjust the cache + * size. To prevent the cache growing unbounded at these times we + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. + * + * 2. 
The Megiddo and Modha model assumes a fixed cache size. + * Pages are evicted when the cache is full and there is a cache + * miss. Our model has a variable sized cache. It grows with + * high use, but also tries to react to memory pressure from the + * operating system: decreasing its size when system memory is + * tight. + * + * 3. The Megiddo and Modha model assumes a fixed page size. All + * elements of the cache are therefor exactly the same size. So + * when adjusting the cache size following a cache miss, its simply + * a matter of choosing a single page to evict. In our model, we + * have variable sized cache blocks (rangeing from 512 bytes to + * 128K bytes). We therefor choose a set of blocks to evict to make + * space for a cache miss that approximates as closely as possible + * the space used by the new block. + * + * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" + * by N. Megiddo & D. Modha, FAST 2003 + */ + +/* + * The locking model: + * + * A new reference to a cache buffer can be obtained in two + * ways: 1) via a hash table lookup using the DVA as a key, + * or 2) via one of the ARC lists. The arc_read() interface + * uses method 1, while the internal arc algorithms for + * adjusting the cache use method 2. We therefor provide two + * types of locks: 1) the hash table lock array, and 2) the + * arc list locks. + * + * Buffers do not have their own mutexs, rather they rely on the + * hash table mutexs for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexs). + * + * buf_hash_find() returns the appropriate mutex (held) when it + * locates the requested buffer in the hash table. It returns + * NULL for the mutex if the buffer was not in the table. + * + * buf_hash_remove() expects the appropriate hash mutex to be + * already held before it is invoked. + * + * Each arc state also has a mutex which is used to protect the + * buffer list associated with the state. When attempting to + * obtain a hash table lock while holding an arc list lock you + * must use: mutex_tryenter() to avoid deadlock. Also note that + * the active state mutex must be held before the ghost state mutex. + * + * Arc buffers may have an associated eviction callback function. + * This function will be invoked prior to removing the buffer (e.g. + * in arc_do_user_evicts()). Note however that the data associated + * with the buffer may be evicted prior to the callback. The callback + * must be made with *no locks held* (to prevent deadlock). Additionally, + * the users of callbacks must ensure that their private data is + * protected from simultaneous callbacks from arc_buf_evict() + * and arc_do_user_evicts(). + * + * Note that the majority of the performance stats are manipulated + * with atomic operations. 
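/*
 * Illustrative sketch (not part of the imported file) of the lock-ordering
 * rule described above: while an ARC list lock is held, a hash-table lock
 * may only be tried, never waited for.  The two mutex parameters stand in
 * for the list lock and the per-bucket hash lock and are assumptions of
 * this example.
 */
static boolean_t
example_try_hash_lock(kmutex_t *list_lock, kmutex_t *hash_lock)
{
        ASSERT(MUTEX_HELD(list_lock));

        if (!mutex_tryenter(hash_lock)) {
                /* skip this buffer rather than risk a deadlock */
                return (B_FALSE);
        }
        return (B_TRUE);
}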
+ * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists + */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#ifdef _KERNEL +#include <sys/vmsystm.h> +#include <vm/anon.h> +#include <sys/fs/swapnode.h> +#include <sys/dnlc.h> +#endif +#include <sys/callb.h> +#include <sys/kstat.h> +#include <zfs_fletcher.h> + +static kmutex_t arc_reclaim_thr_lock; +static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ +static uint8_t arc_thread_exit; + +extern int zfs_write_limit_shift; +extern uint64_t zfs_write_limit_max; +extern kmutex_t zfs_write_limit_lock; + +#define ARC_REDUCE_DNLC_PERCENT 3 +uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; + +typedef enum arc_reclaim_strategy { + ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ + ARC_RECLAIM_CONS /* Conservative reclaim strategy */ +} arc_reclaim_strategy_t; + +/* number of seconds before growing cache again */ +static int arc_grow_retry = 60; + +/* shift of arc_c for calculating both min and max arc_p */ +static int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +static int arc_shrink_shift = 5; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_lifespan; + +static int arc_dead; + +/* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* + * These tunables are for performance analysis. + */ +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_p_min_shift = 0; + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recentely used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. 
The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. + */ + +typedef struct arc_state { + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ + kmutex_t arcs_mtx; +} arc_state_t; + +/* The 6 states: */ +static arc_state_t ARC_anon; +static arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +static arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + kstat_named_t arcstat_recycle_miss; + kstat_named_t arcstat_mutex_miss; + kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_data_size; + kstat_named_t arcstat_other_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_memory_throttle_count; +} arc_stats_t; + +static arc_stats_t arc_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_misses", KSTAT_DATA_UINT64 }, + { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_misses", KSTAT_DATA_UINT64 }, + { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_misses", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, + { "mru_hits", KSTAT_DATA_UINT64 }, + { "mru_ghost_hits", KSTAT_DATA_UINT64 }, + { "mfu_hits", KSTAT_DATA_UINT64 }, + { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "deleted", KSTAT_DATA_UINT64 }, + { "recycle_miss", KSTAT_DATA_UINT64 }, + { "mutex_miss", KSTAT_DATA_UINT64 }, + { "evict_skip", 
KSTAT_DATA_UINT64 }, + { "evict_l2_cached", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "p", KSTAT_DATA_UINT64 }, + { "c", KSTAT_DATA_UINT64 }, + { "c_min", KSTAT_DATA_UINT64 }, + { "c_max", KSTAT_DATA_UINT64 }, + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "other_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 } +}; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)); + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define ARCSTAT_MAX(stat, val) { \ + uint64_t m; \ + while ((val) > (m = arc_stats.stat.value.ui64) && \ + (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ + continue; \ +} + +#define ARCSTAT_MAXSTAT(stat) \ + ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) + +/* + * We define a macro to allow ARC hits/misses to be easily broken down by + * two separate conditions, giving a total of four different subtypes for + * each of hits and misses (so eight statistics total). + */ +#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ + if (cond1) { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ + } \ + } else { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ + } \ + } + +kstat_t *arc_ksp; +static arc_state_t *arc_anon; +static arc_state_t *arc_mru; +static arc_state_t *arc_mru_ghost; +static arc_state_t *arc_mfu; +static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. 
+ */
+#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+
+static int arc_no_grow; /* Don't try to grow cache size */
+static uint64_t arc_tempreserve;
+static uint64_t arc_loaned_bytes;
+static uint64_t arc_meta_used;
+static uint64_t arc_meta_limit;
+static uint64_t arc_meta_max = 0;
+
+typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ void *acb_private;
+ arc_done_func_t *acb_done;
+ arc_buf_t *acb_buf;
+ zio_t *acb_zio_dummy;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ uint64_t b_cksum0;
+
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+ void *b_thawed;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_buf_t *b_buf;
+ uint32_t b_flags;
+ uint32_t b_datacnt;
+
+ arc_callback_t *b_acb;
+ kcondvar_t b_cv;
+
+ /* immutable */
+ arc_buf_contents_t b_type;
+ uint64_t b_size;
+ uint64_t b_spa;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ list_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ refcount_t b_refcnt;
+
+ l2arc_buf_hdr_t *b_l2hdr;
+ list_node_t b_l2node;
+};
+
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static arc_buf_hdr_t arc_eviction_hdr;
+static void arc_get_data_buf(arc_buf_t *buf);
+static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+static int arc_evict_needed(arc_buf_contents_t type);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
+
+/*
+ * Private ARC flags. These flags are private, ARC-only flags that will show up
+ * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read. However, these flags
+ * should never be passed and should only be set by ARC code. When adding new
+ * public flags, make sure not to smash the private ones.
+ */ + +#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ +#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ +#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ +#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ +#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ +#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ + +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) + +/* + * Other sizes + */ + +#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) + +/* + * Hash table routines + */ + +#define HT_LOCK_PAD 64 + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +#define BUF_LOCKS 256 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS]; +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) + +uint64_t zfs_crc64_table[256]; + +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ +boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* 
desired write size, bytes */ + uint64_t l2ad_boost; /* warmup write boost, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + +static uint64_t +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) +{ + uint8_t *vdva = (uint8_t *)dva; + uint64_t crc = -1ULL; + int i; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + for (i = 0; i < sizeof (dva_t); i++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; + + crc ^= (spa>>8) ^ birth; + + return (crc); +} + +#define BUF_EMPTY(buf) \ + ((buf)->b_dva.dva_word[0] == 0 && \ + (buf)->b_dva.dva_word[1] == 0 && \ + (buf)->b_birth == 0) + +#define BUF_EQUAL(spa, dva, birth, buf) \ + ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((buf)->b_birth == birth) && ((buf)->b_spa == spa) + +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; + hdr->b_cksum0 = 0; +} + +static arc_buf_hdr_t * +buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *buf; + + mutex_enter(hash_lock); + for (buf = buf_hash_table.ht_table[idx]; buf != NULL; + buf = buf->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, buf)) { + *lockp = hash_lock; + return (buf); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. 
If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fbuf; + uint32_t i; + + ASSERT(!HDR_IN_HASH_TABLE(buf)); + *lockp = hash_lock; + mutex_enter(hash_lock); + for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; + fbuf = fbuf->b_hash_next, i++) { + if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) + return (fbuf); + } + + buf->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = buf; + buf->b_flags |= ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + if (i > 0) { + ARCSTAT_BUMP(arcstat_hash_collisions); + if (i == 1) + ARCSTAT_BUMP(arcstat_hash_chains); + + ARCSTAT_MAX(arcstat_hash_chain_max, i); + } + + ARCSTAT_BUMP(arcstat_hash_elements); + ARCSTAT_MAXSTAT(arcstat_hash_elements); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *buf) +{ + arc_buf_hdr_t *fbuf, **bufp; + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + ASSERT(HDR_IN_HASH_TABLE(buf)); + + bufp = &buf_hash_table.ht_table[idx]; + while ((fbuf = *bufp) != buf) { + ASSERT(fbuf != NULL); + bufp = &fbuf->b_hash_next; + } + *bufp = buf->b_hash_next; + buf->b_hash_next = NULL; + buf->b_flags &= ~ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + ARCSTAT_BUMPDOWN(arcstat_hash_elements); + + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + ARCSTAT_BUMPDOWN(arcstat_hash_chains); +} + +/* + * Global data structures and functions for the buf kmem cache. + */ +static kmem_cache_t *hdr_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. + */ +/* ARGSUSED */ +static int +hdr_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_hdr_t)); + refcount_create(&buf->b_refcnt); + cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. 
+ */ +/* ARGSUSED */ +static void +hdr_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *buf = vbuf; + + ASSERT(BUF_EMPTY(buf)); + refcount_destroy(&buf->b_refcnt); + cv_destroy(&buf->b_cv); + mutex_destroy(&buf->b_freeze_lock); + arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + mutex_destroy(&buf->b_evict_lock); + rw_destroy(&buf->b_data_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); +} + +/* + * Reclaim callback -- invoked when memory is low. + */ +/* ARGSUSED */ +static void +hdr_recl(void *unused) +{ + dprintf("hdr_recl called\n"); + /* + * umem calls the reclaim func when we destroy the buf cache, + * which is after we do arc_fini(). + */ + if (!arc_dead) + cv_signal(&arc_reclaim_thr_cv); +} + +static void +buf_init(void) +{ + uint64_t *ct; + uint64_t hsize = 1ULL << 12; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 64K block size. The table will take up + * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). + */ + while (hsize * 65536 < physmem * PAGESIZE) + hsize <<= 1; +retry: + buf_hash_table.ht_mask = hsize - 1; + buf_hash_table.ht_table = + kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); + if (buf_hash_table.ht_table == NULL) { + ASSERT(hsize > (1ULL << 8)); + hsize >>= 1; + goto retry; + } + + hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), + 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static void +arc_cksum_verify(arc_buf_t *buf) +{ + zio_cksum_t zc; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || + (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + panic("buffer modified while frozen!"); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + +static void +arc_cksum_compute(arc_buf_t *buf, boolean_t force) +{ + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, + buf->b_hdr->b_freeze_cksum); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + 
arc_cksum_verify(buf); + } + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + buf->b_hdr->b_freeze_cksum = NULL; + } + + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_thawed) + kmem_free(buf->b_hdr->b_thawed, 1); + buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + } + + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_freeze(arc_buf_t *buf) +{ + kmutex_t *hash_lock; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || + buf->b_hdr->b_state == arc_anon); + arc_cksum_compute(buf, B_FALSE); + mutex_exit(hash_lock); +} + +static void +add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if ((refcount_add(&ab->b_refcnt, tag) == 1) && + (ab->b_state != arc_anon)) { + uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; + + ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); + mutex_enter(&ab->b_state->arcs_mtx); + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(list, ab); + if (GHOST_STATE(ab->b_state)) { + ASSERT3U(ab->b_datacnt, ==, 0); + ASSERT3P(ab->b_buf, ==, NULL); + delta = ab->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(&ab->b_state->arcs_mtx); + /* remove the prefetch flag if we get a reference */ + if (ab->b_flags & ARC_PREFETCH) + ab->b_flags &= ~ARC_PREFETCH; + } +} + +static int +remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + int cnt; + arc_state_t *state = ab->b_state; + + ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(!GHOST_STATE(state)); + + if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(!list_link_active(&ab->b_arc_node)); + list_insert_head(&state->arcs_list[ab->b_type], ab); + ASSERT(ab->b_datacnt > 0); + atomic_add_64(size, ab->b_size * ab->b_datacnt); + mutex_exit(&state->arcs_mtx); + } + return (cnt); +} + +/* + * Move the supplied buffer to the indicated state. The mutex + * for the buffer must be held by the caller. + */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +{ + arc_state_t *old_state = ab->b_state; + int64_t refcnt = refcount_count(&ab->b_refcnt); + uint64_t from_delta, to_delta; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(new_state != old_state); + ASSERT(refcnt == 0 || ab->b_datacnt > 0); + ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); + + from_delta = to_delta = ab->b_datacnt * ab->b_size; + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. + */ + if (refcnt == 0) { + if (old_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + + if (use_mutex) + mutex_enter(&old_state->arcs_mtx); + + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&old_state->arcs_list[ab->b_type], ab); + + /* + * If prefetching out of the ghost cache, + * we will have a non-zero datacnt. 
+ */ + if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + /* ghost elements have a ghost size */ + ASSERT(ab->b_buf == NULL); + from_delta = ab->b_size; + } + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); + + if (use_mutex) + mutex_exit(&old_state->arcs_mtx); + } + if (new_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + + if (use_mutex) + mutex_enter(&new_state->arcs_mtx); + + list_insert_head(&new_state->arcs_list[ab->b_type], ab); + + /* ghost elements have a ghost size */ + if (GHOST_STATE(new_state)) { + ASSERT(ab->b_datacnt == 0); + ASSERT(ab->b_buf == NULL); + to_delta = ab->b_size; + } + atomic_add_64(size, to_delta); + + if (use_mutex) + mutex_exit(&new_state->arcs_mtx); + } + } + + ASSERT(!BUF_EMPTY(ab)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) + buf_hash_remove(ab); + + /* adjust state sizes */ + if (to_delta) + atomic_add_64(&new_state->arcs_size, to_delta); + if (from_delta) { + ASSERT3U(old_state->arcs_size, >=, from_delta); + atomic_add_64(&old_state->arcs_size, -from_delta); + } + ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); +} + +void +arc_space_consume(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, space); + break; + } + + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, -space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, -space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, -space); + break; + } + + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); +} + +arc_buf_t * +arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + + ASSERT3U(size, >, 0); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_type = type; + hdr->b_spa = spa_guid(spa); + hdr->b_state = arc_anon; + hdr->b_arc_access = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + hdr->b_datacnt = 1; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + (void) refcount_add(&hdr->b_refcnt, tag); + + 
return (buf); +} + +static char *arc_onloan_tag = "onloan"; + +/* + * Loan out an anonymous arc buffer. Loaned buffers are not counted as in + * flight data by arc_tempreserve_space() until they are "returned". Loaned + * buffers must be returned to the arc before they can be used by the DMU or + * freed. + */ +arc_buf_t * +arc_loan_buf(spa_t *spa, int size) +{ + arc_buf_t *buf; + + buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + + atomic_add_64(&arc_loaned_bytes, size); + return (buf); +} + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(buf->b_data != NULL); + (void) refcount_add(&hdr->b_refcnt, tag); + (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + + atomic_add_64(&arc_loaned_bytes, -hdr->b_size); +} + +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + + ASSERT(buf->b_data != NULL); + hdr = buf->b_hdr; + (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_refcnt, tag); + buf->b_efunc = NULL; + buf->b_private = NULL; + + atomic_add_64(&arc_loaned_bytes, hdr->b_size); +} + +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = hdr->b_size; + + ASSERT(hdr->b_state != arc_anon); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = hdr->b_buf; + hdr->b_buf = buf; + arc_get_data_buf(buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_datacnt += 1; + return (buf); +} + +void +arc_buf_add_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + + /* + * Check to see if this buffer is evicted. Callers + * must verify b_data != NULL to know if the add_ref + * was successful. + */ + mutex_enter(&buf->b_evict_lock); + if (buf->b_data == NULL) { + mutex_exit(&buf->b_evict_lock); + return; + } + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + mutex_exit(&buf->b_evict_lock); + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); +} + +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. 
+ */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + +static void +arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +{ + arc_buf_t **bufp; + + /* free up data associated with the buf */ + if (buf->b_data) { + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_cksum_verify(buf); + + if (!recycle) { + if (type == ARC_BUFC_METADATA) { + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); + arc_space_return(size, ARC_SPACE_DATA); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); + ARCSTAT_INCR(arcstat_data_size, -size); + atomic_add_64(&arc_size, -size); + } + } + if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(state != arc_anon); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); + } + ASSERT3U(state->arcs_size, >=, size); + atomic_add_64(&state->arcs_size, -size); + buf->b_data = NULL; + ASSERT(buf->b_hdr->b_datacnt > 0); + buf->b_hdr->b_datacnt -= 1; + } + + /* only remove the buf if requested */ + if (!all) + return; + + /* remove the buf from the hdr list */ + for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + continue; + *bufp = buf->b_next; + buf->b_next = NULL; + + ASSERT(buf->b_efunc == NULL); + + /* clean up the buf */ + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); +} + +static void +arc_hdr_destroy(arc_buf_hdr_t *hdr) +{ + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT3P(hdr->b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; + + if (l2hdr != NULL) { + boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
+ */ + if (!buflist_held) { + mutex_enter(&l2arc_buflist_mtx); + l2hdr = hdr->b_l2hdr; + } + + if (l2hdr != NULL) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + + if (!buflist_held) + mutex_exit(&l2arc_buflist_mtx); + } + + if (!BUF_EMPTY(hdr)) { + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + buf_discard_identity(hdr); + } + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; + + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + mutex_enter(&buf->b_evict_lock); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_buf, FALSE, FALSE); + hdr->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_buf, FALSE, TRUE); + } + } + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + if (hdr->b_thawed) { + kmem_free(hdr->b_thawed, 1); + hdr->b_thawed = NULL; + } + + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + ASSERT3P(hdr->b_acb, ==, NULL); + kmem_cache_free(hdr_cache, hdr); +} + +void +arc_buf_free(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + int hashed = hdr->b_state != arc_anon; + + ASSERT(buf->b_efunc == NULL); + ASSERT(buf->b_data != NULL); + + if (hashed) { + kmutex_t *hash_lock = HDR_LOCK(hdr); + + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) { + arc_buf_destroy(buf, FALSE, TRUE); + } else { + ASSERT(buf == hdr->b_buf); + ASSERT(buf->b_efunc == NULL); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + mutex_exit(hash_lock); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + int destroy_hdr; + /* + * We are in the middle of an async write. Don't destroy + * this buffer unless the write completes before we finish + * decrementing the reference count. 
+ */ + mutex_enter(&arc_eviction_mtx); + (void) remove_reference(hdr, NULL, tag); + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + if (remove_reference(hdr, NULL, tag) > 0) + arc_buf_destroy(buf, FALSE, TRUE); + else + arc_hdr_destroy(hdr); + } +} + +int +arc_buf_remove_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + int no_callback = (buf->b_efunc == NULL); + + if (hdr->b_state == arc_anon) { + ASSERT(hdr->b_datacnt == 1); + arc_buf_free(buf, tag); + return (no_callback); + } + + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT(hdr->b_state != arc_anon); + ASSERT(buf->b_data != NULL); + + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) { + if (no_callback) + arc_buf_destroy(buf, FALSE, TRUE); + } else if (no_callback) { + ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(buf->b_efunc == NULL); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + ASSERT(no_callback || hdr->b_datacnt > 1 || + refcount_is_zero(&hdr->b_refcnt)); + mutex_exit(hash_lock); + return (no_callback); +} + +int +arc_buf_size(arc_buf_t *buf) +{ + return (buf->b_hdr->b_size); +} + +/* + * Evict buffers from list until we've removed the specified number of + * bytes. Move the removed buffers to the appropriate evict state. + * If the recycle flag is set, then attempt to "recycle" a buffer: + * - look for a buffer to evict that is `bytes' long. + * - return the data block from this buffer rather than freeing it. + * This flag is used by callers that are trying to make space for a + * new buffer in a full arc cache. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. + */ +static void * +arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, + arc_buf_contents_t type) +{ + arc_state_t *evicted_state; + uint64_t bytes_evicted = 0, skipped = 0, missed = 0; + arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; + kmutex_t *hash_lock; + boolean_t have_lock; + void *stolen = NULL; + + ASSERT(state == arc_mru || state == arc_mfu); + + evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(ab) || + (spa && ab->b_spa != spa) || + (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && + ddi_get_lbolt() - ab->b_arc_access < + arc_min_prefetch_lifespan)) { + skipped++; + continue; + } + /* "lookahead" for better eviction candidate */ + if (recycle && ab->b_size != bytes && + ab_prev && ab_prev->b_size == bytes) + continue; + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (have_lock || mutex_tryenter(hash_lock)) { + ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + ASSERT(ab->b_datacnt > 0); + while (ab->b_buf) { + arc_buf_t *buf = ab->b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + missed += 1; + break; + } + if (buf->b_data) { + bytes_evicted += ab->b_size; + if (recycle && ab->b_type == type && + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { + stolen = buf->b_data; + recycle = FALSE; + } + } + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + arc_buf_destroy(buf, + buf->b_data == stolen, FALSE); + ab->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&arc_eviction_mtx); + mutex_exit(&buf->b_evict_lock); + } else { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, + buf->b_data == stolen, TRUE); + } + } + + if (ab->b_l2hdr) { + ARCSTAT_INCR(arcstat_evict_l2_cached, + ab->b_size); + } else { + if (l2arc_write_eligible(ab->b_spa, ab)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + ab->b_size); + } else { + ARCSTAT_INCR( + arcstat_evict_l2_ineligible, + ab->b_size); + } + } + + if (ab->b_datacnt == 0) { + arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + } + if (!have_lock) + mutex_exit(hash_lock); + if (bytes >= 0 && bytes_evicted >= bytes) + break; + } else { + missed += 1; + } + } + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&state->arcs_mtx); + + if (bytes_evicted < bytes) + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + + if (skipped) + ARCSTAT_INCR(arcstat_evict_skip, skipped); + + if (missed) + ARCSTAT_INCR(arcstat_mutex_miss, missed); + + /* + * We have just evicted some date into the ghost state, make + * sure we also adjust the ghost state size if necessary. + */ + if (arc_no_grow && + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { + int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + int64_t todelete = + MIN(arc_mru_ghost->arcs_lsize[type], mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { + int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } + + return (stolen); +} + +/* + * Remove buffers from list until we've removed the specified number of + * bytes. Destroy the buffers that are removed. 
+ */ +static void +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) +{ + arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t marker = { 0 }; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; + kmutex_t *hash_lock; + uint64_t bytes_deleted = 0; + uint64_t bufs_skipped = 0; + + ASSERT(GHOST_STATE(state)); +top: + mutex_enter(&state->arcs_mtx); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + if (spa && ab->b_spa != spa) + continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + + hash_lock = HDR_LOCK(ab); + /* caller may be trying to modify this buffer, skip it */ + if (MUTEX_HELD(hash_lock)) + continue; + if (mutex_tryenter(hash_lock)) { + ASSERT(!HDR_IO_IN_PROGRESS(ab)); + ASSERT(ab->b_buf == NULL); + ARCSTAT_BUMP(arcstat_deleted); + bytes_deleted += ab->b_size; + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + if (bytes >= 0 && bytes_deleted >= bytes) + break; + } else if (bytes < 0) { + /* + * Insert a list marker and then wait for the + * hash lock to become available. Once its + * available, restart from where we left off. + */ + list_insert_after(list, ab, &marker); + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + mutex_enter(&state->arcs_mtx); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + } else + bufs_skipped += 1; + } + mutex_exit(&state->arcs_mtx); + + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + + if (bufs_skipped) { + ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); + ASSERT(bytes >= 0); + } + + if (bytes_deleted < bytes) + dprintf("only deleted %lld bytes from %p", + (longlong_t)bytes_deleted, state); +} + +static void +arc_adjust(void) +{ + int64_t adjustment, delta; + + /* + * Adjust MRU size + */ + + adjustment = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); + + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; + } + + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, + ARC_BUFC_METADATA); + } + + /* + * Adjust MFU size + */ + + adjustment = arc_size - arc_c; + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; + } + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t delta = MIN(adjustment, + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, + ARC_BUFC_METADATA); + } + + /* + * Adjust ghost lists + */ + + adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { + delta = MIN(arc_mru_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mru_ghost, NULL, delta); + } + + adjustment = + 
arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { + delta = MIN(arc_mfu_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mfu_ghost, NULL, delta); + } +} + +static void +arc_do_user_evicts(void) +{ + mutex_enter(&arc_eviction_mtx); + while (arc_eviction_list != NULL) { + arc_buf_t *buf = arc_eviction_list; + arc_eviction_list = buf->b_next; + mutex_enter(&buf->b_evict_lock); + buf->b_hdr = NULL; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_eviction_mtx); + + if (buf->b_efunc != NULL) + VERIFY(buf->b_efunc(buf) == 0); + + buf->b_efunc = NULL; + buf->b_private = NULL; + kmem_cache_free(buf_cache, buf); + mutex_enter(&arc_eviction_mtx); + } + mutex_exit(&arc_eviction_mtx); +} + +/* + * Flush all *evictable* data from the cache for the given spa. + * NOTE: this will not touch "active" (i.e. referenced) data. + */ +void +arc_flush(spa_t *spa) +{ + uint64_t guid = 0; + + if (spa) + guid = spa_guid(spa); + + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); + + mutex_enter(&arc_reclaim_thr_lock); + arc_do_user_evicts(); + mutex_exit(&arc_reclaim_thr_lock); + ASSERT(spa || arc_eviction_list == NULL); +} + +void +arc_shrink(void) +{ + if (arc_c > arc_c_min) { + uint64_t to_free; + +#ifdef _KERNEL + to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); +#else + to_free = arc_c >> arc_shrink_shift; +#endif + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (arc_c > arc_size) + arc_c = MAX(arc_size, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + if (arc_size > arc_c) + arc_adjust(); +} + +static int +arc_reclaim_needed(void) +{ + uint64_t extra; + +#ifdef _KERNEL + + if (needfree) + return (1); + + /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. 
+ */ + if (availrmem < swapfs_minfree + swapfs_reserve + extra) + return (1); + +#if defined(__i386) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the calculation, if less than 1/4th is + * free) + */ + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) + return (1); +#endif + +#else + if (spa_get_random(100) == 0) + return (1); +#endif + return (0); +} + +static void +arc_kmem_reap_now(arc_reclaim_strategy_t strat) +{ + size_t i; + kmem_cache_t *prev_cache = NULL; + kmem_cache_t *prev_data_cache = NULL; + extern kmem_cache_t *zio_buf_cache[]; + extern kmem_cache_t *zio_data_buf_cache[]; + +#ifdef _KERNEL + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } +#if defined(__i386) + /* + * Reclaim unused memory from all kmem caches. + */ + kmem_reap(); +#endif +#endif + + /* + * An aggressive reclamation will shrink the cache size as well as + * reap free buffers from the arc kmem caches. + */ + if (strat == ARC_RECLAIM_AGGR) + arc_shrink(); + + for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { + if (zio_buf_cache[i] != prev_cache) { + prev_cache = zio_buf_cache[i]; + kmem_cache_reap_now(zio_buf_cache[i]); + } + if (zio_data_buf_cache[i] != prev_data_cache) { + prev_data_cache = zio_data_buf_cache[i]; + kmem_cache_reap_now(zio_data_buf_cache[i]); + } + } + kmem_cache_reap_now(buf_cache); + kmem_cache_reap_now(hdr_cache); +} + +static void +arc_reclaim_thread(void) +{ + clock_t growtime = 0; + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_thr_lock); + while (arc_thread_exit == 0) { + if (arc_reclaim_needed()) { + + if (arc_no_grow) { + if (last_reclaim == ARC_RECLAIM_CONS) { + last_reclaim = ARC_RECLAIM_AGGR; + } else { + last_reclaim = ARC_RECLAIM_CONS; + } + } else { + arc_no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + membar_producer(); + } + + /* reset the growth delay for every reclaim */ + growtime = ddi_get_lbolt() + (arc_grow_retry * hz); + + arc_kmem_reap_now(last_reclaim); + arc_warm = B_TRUE; + + } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { + arc_no_grow = FALSE; + } + + arc_adjust(); + + if (arc_eviction_list != NULL) + arc_do_user_evicts(); + + /* block until needed, or one second, whichever is shorter */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&arc_reclaim_thr_cv, + &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + } + + arc_thread_exit = 0; + cv_broadcast(&arc_reclaim_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + thread_exit(); +} + +/* + * Adapt arc info given the number of bytes we are trying to add and + * the state that we are comming from. This function is only called + * when we are adding new content to the cache. 
+ */ +static void +arc_adapt(int bytes, arc_state_t *state) +{ + int mult; + uint64_t arc_p_min = (arc_c >> arc_p_min_shift); + + if (state == arc_l2c_only) + return; + + ASSERT(bytes > 0); + /* + * Adapt the target size of the MRU list: + * - if we just hit in the MRU ghost list, then increase + * the target size of the MRU list. + * - if we just hit in the MFU ghost list, then increase + * the target size of the MFU list by decreasing the + * target size of the MRU list. + */ + if (state == arc_mru_ghost) { + mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? + 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ + + arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); + } else if (state == arc_mfu_ghost) { + uint64_t delta; + + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? + 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = MIN(mult, 10); + + delta = MIN(bytes * mult, arc_p); + arc_p = MAX(arc_p_min, arc_p - delta); + } + ASSERT((int64_t)arc_p >= 0); + + if (arc_reclaim_needed()) { + cv_signal(&arc_reclaim_thr_cv); + return; + } + + if (arc_no_grow) + return; + + if (arc_c >= arc_c_max) + return; + + /* + * If we're within (2 * maxblocksize) bytes of the target + * cache size, increment the target cache size + */ + if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + atomic_add_64(&arc_c, (int64_t)bytes); + if (arc_c > arc_c_max) + arc_c = arc_c_max; + else if (state == arc_anon) + atomic_add_64(&arc_p, (int64_t)bytes); + if (arc_p > arc_c) + arc_p = arc_c; + } + ASSERT((int64_t)arc_p >= 0); +} + +/* + * Check if the cache has reached its limits and eviction is required + * prior to insert. + */ +static int +arc_evict_needed(arc_buf_contents_t type) +{ + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif + + if (arc_reclaim_needed()) + return (1); + + return (arc_size > arc_c); +} + +/* + * The buffer, supplied as the first argument, needs a data block. + * So, if we are at cache max, determine which cache should be victimized. + * We have the following cases: + * + * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> + * In this situation if we're out of space, but the resident size of the MFU is + * under the limit, victimize the MFU cache to satisfy this insertion request. + * + * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> + * Here, we've used up all of the available space for the MRU, so we need to + * evict from our own cache instead. Evict from the set of resident MRU + * entries. + * + * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> + * c minus p represents the MFU space in the cache, since p is the size of the + * cache that is dedicated to the MRU. In this situation there's still space on + * the MFU side, so the MRU side needs to be victimized. + * + * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> + * MFU's resident set is consuming more space than it has been allotted. In + * this situation, we must victimize our own cache, the MFU, for this insertion. 
+ */ +static void +arc_get_data_buf(arc_buf_t *buf) +{ + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_adapt(size, state); + + /* + * We have not yet reached cache maximum size, + * just allocate a new buffer. + */ + if (!arc_evict_needed(type)) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_DATA); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); + atomic_add_64(&arc_size, size); + } + goto out; + } + + /* + * If we are prefetching from the mfu ghost list, this buffer + * will end up on the mru list; so steal space from there. + */ + if (state == arc_mfu_ghost) + state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + else if (state == arc_mru_ghost) + state = arc_mru; + + if (state == arc_mru || state == arc_anon) { + uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; + state = (arc_mfu->arcs_lsize[type] >= size && + arc_p > mru_used) ? arc_mfu : arc_mru; + } else { + /* MFU cases */ + uint64_t mfu_space = arc_c - arc_p; + state = (arc_mru->arcs_lsize[type] >= size && + mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + } + if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_DATA); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); + atomic_add_64(&arc_size, size); + } + ARCSTAT_BUMP(arcstat_recycle_miss); + } + ASSERT(buf->b_data != NULL); +out: + /* + * Update the state size. Note that ghost states have a + * "ghost size" and so don't need to be updated. + */ + if (!GHOST_STATE(buf->b_hdr->b_state)) { + arc_buf_hdr_t *hdr = buf->b_hdr; + + atomic_add_64(&hdr->b_state->arcs_size, size); + if (list_link_active(&hdr->b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + } + /* + * If we are growing the cache, and we are adding anonymous + * data, and we have outgrown arc_p, update arc_p + */ + if (arc_size < arc_c && hdr->b_state == arc_anon && + arc_anon->arcs_size + arc_mru->arcs_size > arc_p) + arc_p = MIN(arc_c, arc_p + size); + } +} + +/* + * This routine is called whenever a buffer is accessed. + * NOTE: the hash lock is dropped in this function. + */ +static void +arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +{ + clock_t now; + + ASSERT(MUTEX_HELD(hash_lock)); + + if (buf->b_state == arc_anon) { + /* + * This buffer is not in the cache, and does not + * appear in our "ghost" list. Add the new buffer + * to the MRU state. + */ + + ASSERT(buf->b_arc_access == 0); + buf->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + arc_change_state(arc_mru, buf, hash_lock); + + } else if (buf->b_state == arc_mru) { + now = ddi_get_lbolt(); + + /* + * If this buffer is here because of a prefetch, then either: + * - clear the flag if this is a "referencing" read + * (any subsequent access will bump this into the MFU state). + * or + * - move the buffer to the head of the list if this is + * another prefetch (to make it less likely to be evicted). 
+ */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + if (refcount_count(&buf->b_refcnt) == 0) { + ASSERT(list_link_active(&buf->b_arc_node)); + } else { + buf->b_flags &= ~ARC_PREFETCH; + ARCSTAT_BUMP(arcstat_mru_hits); + } + buf->b_arc_access = now; + return; + } + + /* + * This buffer has been "accessed" only once so far, + * but it is still in the cache. Move it to the MFU + * state. + */ + if (now > buf->b_arc_access + ARC_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. + */ + buf->b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } + ARCSTAT_BUMP(arcstat_mru_hits); + } else if (buf->b_state == arc_mru_ghost) { + arc_state_t *new_state; + /* + * This buffer has been "accessed" recently, but + * was evicted from the cache. Move it to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + new_state = arc_mru; + if (refcount_count(&buf->b_refcnt) > 0) + buf->b_flags &= ~ARC_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + } else { + new_state = arc_mfu; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + } + + buf->b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + } else if (buf->b_state == arc_mfu) { + /* + * This buffer has been accessed more than once and is + * still in the cache. Keep it in the MFU state. + * + * NOTE: an add_reference() that occurred when we did + * the arc_read() will have kicked this off the list. + * If it was a prefetch, we will explicitly move it to + * the head of the list now. + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + ASSERT(refcount_count(&buf->b_refcnt) == 0); + ASSERT(list_link_active(&buf->b_arc_node)); + } + ARCSTAT_BUMP(arcstat_mfu_hits); + buf->b_arc_access = ddi_get_lbolt(); + } else if (buf->b_state == arc_mfu_ghost) { + arc_state_t *new_state = arc_mfu; + /* + * This buffer has been accessed more than once but has + * been evicted from the cache. Move it back to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + /* + * This is a prefetch access... + * move this block back to the MRU state. + */ + ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + new_state = arc_mru; + } + + buf->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. 
+ */ + + buf->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } else { + ASSERT(!"invalid arc state"); + } +} + +/* a generic arc_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + if (zio == NULL || zio->io_error == 0) + bcopy(buf->b_data, arg, buf->b_hdr->b_size); + VERIFY(arc_buf_remove_ref(buf, arg) == 1); +} + +/* a generic arc_done_func_t */ +void +arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + if (zio && zio->io_error) { + VERIFY(arc_buf_remove_ref(buf, arg) == 1); + *bufp = NULL; + } else { + *bufp = buf; + ASSERT(buf->b_data); + } +} + +static void +arc_read_done(zio_t *zio) +{ + arc_buf_hdr_t *hdr, *found; + arc_buf_t *buf; + arc_buf_t *abuf; /* buffer we're assigning to callback */ + kmutex_t *hash_lock; + arc_callback_t *callback_list, *acb; + int freeable = FALSE; + + buf = zio->io_private; + hdr = buf->b_hdr; + + /* + * The hdr was inserted into hash-table and removed from lists + * prior to starting I/O. We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. The only possible + * reason for it not to be found is if we were freed during the + * read. + */ + found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, + &hash_lock); + + ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + + hdr->b_flags &= ~ARC_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) + hdr->b_flags &= ~ARC_L2CACHE; + + /* byteswap if necessary */ + callback_list = hdr->b_acb; + ASSERT(callback_list != NULL); + if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { + arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? + byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; + func(buf->b_data, hdr->b_size); + } + + arc_cksum_compute(buf, B_FALSE); + + if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + + /* create copies of the data buffer for the callers */ + abuf = buf; + for (acb = callback_list; acb; acb = acb->acb_next) { + if (acb->acb_done) { + if (abuf == NULL) + abuf = arc_buf_clone(buf); + acb->acb_buf = abuf; + abuf = NULL; + } + } + hdr->b_acb = NULL; + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + ASSERT(!HDR_BUF_AVAILABLE(hdr)); + if (abuf == buf) { + ASSERT(buf->b_efunc == NULL); + ASSERT(hdr->b_datacnt == 1); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + + ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + + if (zio->io_error != 0) { + hdr->b_flags |= ARC_IO_ERROR; + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + /* + * Broadcast before we drop the hash_lock to avoid the possibility + * that the hdr (and hence the cv) might be freed before we get to + * the cv_broadcast(). 
+ */
+ cv_broadcast(&hdr->b_cv);
+
+ if (hash_lock) {
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done)
+ acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp. But if you know you don't need locking, you can use
+ * arc_read_bp.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ int err;
+
+ if (pbuf == NULL) {
+ /*
+ * XXX This happens from traverse callback funcs, for
+ * the objset_phys_t block.
+ */ + return (arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb)); + } + + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); + ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); + rw_enter(&pbuf->b_data_lock, RW_READER); + + err = arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb); + rw_exit(&pbuf->b_data_lock); + + return (err); +} + +int +arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + zio_t *rzio; + uint64_t guid = spa_guid(spa); + +top: + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); + if (hdr && hdr->b_datacnt > 0) { + + *arc_flags |= ARC_CACHED; + + if (HDR_IO_IN_PROGRESS(hdr)) { + + if (*arc_flags & ARC_WAIT) { + cv_wait(&hdr->b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + ASSERT(*arc_flags & ARC_NOWAIT); + + if (done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, NULL, zio_flags); + + ASSERT(acb->acb_done != NULL); + acb->acb_next = hdr->b_acb; + hdr->b_acb = acb; + add_reference(hdr, hash_lock, private); + mutex_exit(hash_lock); + return (0); + } + mutex_exit(hash_lock); + return (0); + } + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + if (done) { + add_reference(hdr, hash_lock, private); + /* + * If this block is already in use, create a new + * copy of the data so that we will be guaranteed + * that arc_release() will always succeed. 
+ */ + buf = hdr->b_buf; + ASSERT(buf); + ASSERT(buf->b_data); + if (HDR_BUF_AVAILABLE(hdr)) { + ASSERT(buf->b_efunc == NULL); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + } else { + buf = arc_buf_clone(buf); + } + + } else if (*arc_flags & ARC_PREFETCH && + refcount_count(&hdr->b_refcnt) == 0) { + hdr->b_flags |= ARC_PREFETCH; + } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); + + if (done) + done(NULL, buf, private); + } else { + uint64_t size = BP_GET_LSIZE(bp); + arc_callback_t *acb; + vdev_t *vd = NULL; + uint64_t addr; + boolean_t devw = B_FALSE; + + if (hdr == NULL) { + /* this block is not in the cache */ + arc_buf_hdr_t *exists; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + buf = arc_buf_alloc(spa, size, private, type); + hdr = buf->b_hdr; + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + buf_discard_identity(hdr); + (void) arc_buf_remove_ref(buf, private); + goto top; /* restart the IO request */ + } + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) { + (void) remove_reference(hdr, hash_lock, + private); + hdr->b_flags |= ARC_PREFETCH; + } + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + if (BP_GET_LEVEL(bp) > 0) + hdr->b_flags |= ARC_INDIRECT; + } else { + /* this block is in the ghost cache */ + ASSERT(GHOST_STATE(hdr->b_state)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); + ASSERT(hdr->b_buf == NULL); + + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) + hdr->b_flags |= ARC_PREFETCH; + else + add_reference(hdr, hash_lock, private); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + ASSERT(hdr->b_datacnt == 0); + hdr->b_datacnt = 1; + arc_get_data_buf(buf); + arc_access(hdr, hash_lock); + } + + ASSERT(!GHOST_STATE(hdr->b_state)); + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + + ASSERT(hdr->b_acb == NULL); + hdr->b_acb = acb; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + + if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && + (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr->b_dev->l2ad_writing; + addr = hdr->b_l2hdr->b_daddr; + /* + * Lock out device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + + mutex_exit(hash_lock); + + ASSERT3U(hdr->b_size, ==, size); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, + uint64_t, size, zbookmark_t *, zb); + ARCSTAT_BUMP(arcstat_misses); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, misses); + + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. 
This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. + * 5. This isn't prefetch and l2arc_noprefetch is set. + */ + if (hdr->b_l2hdr != NULL && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && + !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, zio_flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); + + if (*arc_flags & ARC_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + if (l2arc_ndev != 0) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } + } + + rzio = zio_read(pio, spa, bp, buf->b_data, size, + arc_read_done, buf, priority, zio_flags, zb); + + if (*arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(*arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + } + return (0); +} + +void +arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) +{ + ASSERT(buf->b_hdr != NULL); + ASSERT(buf->b_hdr->b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_efunc == NULL); + ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); + + buf->b_efunc = func; + buf->b_private = private; +} + +/* + * This is used by the DMU to let the ARC know that a buffer is + * being evicted, so the ARC should clean up. If this arc buf + * is not yet in the evicted state, it will be put there. + */ +int +arc_buf_evict(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + arc_buf_t **bufp; + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(). + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (0); + } else if (buf->b_data == NULL) { + arc_buf_t copy = *buf; /* structure assignment */ + /* + * We are on the eviction list; process this buffer now + * but let arc_do_user_evicts() do the reaping. 
+ */
+ buf->b_efunc = NULL;
+ mutex_exit(&buf->b_evict_lock);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
+ }
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ /*
+ * Pull this buffer off of the hdr
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_data != NULL);
+ arc_buf_destroy(buf, FALSE, FALSE);
+
+ if (hdr->b_datacnt == 0) {
+ arc_state_t *old_state = hdr->b_state;
+ arc_state_t *evicted_state;
+
+ ASSERT(hdr->b_buf == NULL);
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ evicted_state =
+ (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&old_state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags |= ARC_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ mutex_exit(hash_lock);
+ mutex_exit(&buf->b_evict_lock);
+
+ VERIFY(buf->b_efunc(buf) == 0);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_hdr = NULL;
+ buf->b_next = NULL;
+ kmem_cache_free(buf_cache, buf);
+ return (1);
+}
+
+/*
+ * Release this buffer from the cache. This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock = NULL;
+ l2arc_buf_hdr_t *l2hdr;
+ uint64_t buf_size;
+
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
+ hdr = buf->b_hdr;
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+ if (hdr->b_state == arc_anon) {
+ /* this buffer is already released */
+ ASSERT(buf->b_efunc == NULL);
+ } else {
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ }
+
+ l2hdr = hdr->b_l2hdr;
+ if (l2hdr) {
+ mutex_enter(&l2arc_buflist_mtx);
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_datacnt > 1) {
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t **bufp;
+ uint64_t blksz = hdr->b_size;
+ uint64_t spa = hdr->b_spa;
+ arc_buf_contents_t type = hdr->b_type;
+ uint32_t flags = hdr->b_flags;
+
+ ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
+ /*
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr.
+ */ + (void) remove_reference(hdr, hash_lock, tag); + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = buf->b_next; + buf->b_next = NULL; + + ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_refcnt)) { + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); + } + hdr->b_datacnt -= 1; + arc_cksum_verify(buf); + + mutex_exit(hash_lock); + + nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + nhdr->b_size = blksz; + nhdr->b_spa = spa; + nhdr->b_type = type; + nhdr->b_buf = buf; + nhdr->b_state = arc_anon; + nhdr->b_arc_access = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; + nhdr->b_datacnt = 1; + nhdr->b_freeze_cksum = NULL; + (void) refcount_add(&nhdr->b_refcnt, tag); + buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); + atomic_add_64(&arc_anon->arcs_size, blksz); + } else { + mutex_exit(&buf->b_evict_lock); + ASSERT(refcount_count(&hdr->b_refcnt) == 1); + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_arc_access = 0; + if (hash_lock) + mutex_exit(hash_lock); + + buf_discard_identity(hdr); + arc_buf_thaw(buf); + } + buf->b_efunc = NULL; + buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + mutex_exit(&l2arc_buflist_mtx); + } +} + +/* + * Release this buffer. If it does not match the provided BP, fill it + * with that block's contents. + */ +/* ARGSUSED */ +int +arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb) +{ + arc_release(buf, tag); + return (0); +} + +int +arc_released(arc_buf_t *buf) +{ + int released; + + mutex_enter(&buf->b_evict_lock); + released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + mutex_exit(&buf->b_evict_lock); + return (released); +} + +int +arc_has_callback(arc_buf_t *buf) +{ + int callback; + + mutex_enter(&buf->b_evict_lock); + callback = (buf->b_efunc != NULL); + mutex_exit(&buf->b_evict_lock); + return (callback); +} + +#ifdef ZFS_DEBUG +int +arc_referenced(arc_buf_t *buf) +{ + int referenced; + + mutex_enter(&buf->b_evict_lock); + referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + mutex_exit(&buf->b_evict_lock); + return (referenced); +} +#endif + +static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. + * It is the responsibility of the callback to handle the + * accounting for any re-write attempt. 
+ */ + if (HDR_IO_IN_PROGRESS(hdr)) { + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); + } + arc_cksum_compute(buf, B_FALSE); + hdr->b_flags |= ARC_IO_IN_PROGRESS; +} + +static void +arc_write_done(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(hdr->b_acb == NULL); + + if (zio->io_error == 0) { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } else { + ASSERT(BUF_EMPTY(hdr)); + } + + /* + * If the block to be written was all-zero, we may have + * compressed it away. In this case no write was performed + * so there will be no dva/birth/checksum. The buffer must + * therefore remain anonymous (and uncached). + */ + if (!BUF_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + ASSERT(zio->io_error == 0); + + arc_cksum_verify(buf); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). + */ + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else { + /* Dedup */ + ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } + } + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + /* if it's not anon, we are doing a scrub */ + if (!exists && hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + } + + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); + + kmem_free(callback, sizeof (arc_write_callback_t)); +} + +zio_t * +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_write_callback_t *callback; + zio_t *zio; + + ASSERT(ready != NULL); + ASSERT(done != NULL); + ASSERT(!HDR_IO_ERROR(hdr)); + ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT(hdr->b_acb == NULL); + if (l2arc) + hdr->b_flags |= ARC_L2CACHE; + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; + + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); + + return (zio); +} + +static int +arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) +{ +#ifdef _KERNEL + uint64_t available_memory = ptob(freemem); + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif + if (available_memory >= zfs_write_limit_max) 
+ return (0); + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. + */ + if (curproc == proc_pageout) { + if (page_load > MAX(ptob(minfree), available_memory) / 4) + return (ERESTART); + /* Note: reserve is inflated, so we deflate */ + page_load += reserve / 8; + return (0); + } else if (page_load > 0 && arc_reclaim_needed()) { + /* memory is low, delay before restarting */ + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (EAGAIN); + } + page_load = 0; + + if (arc_size > arc_c_min) { + uint64_t evictable_memory = + arc_mru->arcs_lsize[ARC_BUFC_DATA] + + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + available_memory += MIN(evictable_memory, arc_size - arc_c_min); + } + + if (inflight_data > available_memory / 4) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + return (ERESTART); + } +#endif + return (0); +} + +void +arc_tempreserve_clear(uint64_t reserve) +{ + atomic_add_64(&arc_tempreserve, -reserve); + ASSERT((int64_t)arc_tempreserve >= 0); +} + +int +arc_tempreserve_space(uint64_t reserve, uint64_t txg) +{ + int error; + uint64_t anon_size; + +#ifdef ZFS_DEBUG + /* + * Once in a while, fail for no reason. Everything should cope. + */ + if (spa_get_random(10000) == 0) { + dprintf("forcing random failure\n"); + return (ERESTART); + } +#endif + if (reserve > arc_c/4 && !arc_no_grow) + arc_c = MIN(arc_c_max, reserve * 4); + if (reserve > arc_c) + return (ENOMEM); + + /* + * Don't count loaned bufs as in flight dirty data to prevent long + * network delays from blocking transactions that are ready to be + * assigned to a txg. + */ + anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + + /* + * Writes will, almost always, require additional memory allocations + * in order to compress/encrypt/etc the data. We therefor need to + * make sure that there is sufficient available memory for this. + */ + if (error = arc_memory_throttle(reserve, anon_size, txg)) + return (error); + + /* + * Throttle writes when the amount of dirty data in the cache + * gets too large. We try to keep the cache less than half full + * of dirty blocks so that our sync times don't grow too large. + * Note: if two requests come in concurrently, we might let them + * both succeed, when one of them should fail. Not a huge deal. + */ + + if (reserve + arc_tempreserve + anon_size > arc_c / 2 && + anon_size > arc_c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, + arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, + arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, + reserve>>10, arc_c>>10); + return (ERESTART); + } + atomic_add_64(&arc_tempreserve, reserve); + return (0); +} + +void +arc_init(void) +{ + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + + /* Convert seconds to clock ticks */ + arc_min_prefetch_lifespan = 1 * hz; + + /* Start out with 1/8 of all memory */ + arc_c = physmem * PAGESIZE / 8; + +#ifdef _KERNEL + /* + * On architectures where the physical memory can be larger + * than the addressable space (intel in 32-bit mode), we may + * need to limit the cache to 1/8 of VM size. 
+ */ + arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); +#endif + + /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ + arc_c_min = MAX(arc_c / 4, 64<<20); + /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ + if (arc_c * 8 >= 1<<30) + arc_c_max = (arc_c * 8) - (1<<30); + else + arc_c_max = arc_c_min; + arc_c_max = MAX(arc_c * 6, arc_c_max); + + /* + * Allow the tunables to override our calculations if they are + * reasonable (ie. over 64MB) + */ + if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) + arc_c_max = zfs_arc_max; + if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) + arc_c_min = zfs_arc_min; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + + if (zfs_arc_grow_retry > 0) + arc_grow_retry = zfs_arc_grow_retry; + + if (zfs_arc_shrink_shift > 0) + arc_shrink_shift = zfs_arc_shrink_shift; + + if (zfs_arc_p_min_shift > 0) + arc_p_min_shift = zfs_arc_p_min_shift; + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + arc_size = 0; + + mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + + buf_init(); + + arc_thread_exit = 0; + arc_eviction_list = NULL; + mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); + bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, + sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if 
(arc_ksp != NULL) { + arc_ksp->ks_data = &arc_stats; + kstat_install(arc_ksp); + } + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + + arc_dead = FALSE; + arc_warm = B_FALSE; + + if (zfs_write_limit_max == 0) + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + else + zfs_write_limit_shift = 0; + mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +arc_fini(void) +{ + mutex_enter(&arc_reclaim_thr_lock); + arc_thread_exit = 1; + while (arc_thread_exit != 0) + cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); + mutex_exit(&arc_reclaim_thr_lock); + + arc_flush(NULL); + + arc_dead = TRUE; + + if (arc_ksp != NULL) { + kstat_delete(arc_ksp); + arc_ksp = NULL; + } + + mutex_destroy(&arc_eviction_mtx); + mutex_destroy(&arc_reclaim_thr_lock); + cv_destroy(&arc_reclaim_thr_cv); + + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + + mutex_destroy(&arc_anon->arcs_mtx); + mutex_destroy(&arc_mru->arcs_mtx); + mutex_destroy(&arc_mru_ghost->arcs_mtx); + mutex_destroy(&arc_mfu->arcs_mtx); + mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&arc_l2c_only->arcs_mtx); + + mutex_destroy(&zfs_write_limit_lock); + + buf_fini(); + + ASSERT(arc_loaned_bytes == 0); +} + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. + * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. 
It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. 
+ * + * There are three key functions that control how the L2ARC warms up: + * + * l2arc_write_eligible() check if a buffer is eligible to cache + * l2arc_write_size() calculate how much to write + * l2arc_write_interval() calculate sleep delay between writes + * + * These three functions determine what to write, how much, and how quickly + * to send writes. + */ + +static boolean_t +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +{ + /* + * A buffer is *not* eligible for the L2ARC if it: + * 1. belongs to a different spa. + * 2. is already cached on the L2ARC. + * 3. has an I/O in progress (it may be an incomplete read). + * 4. is flagged not eligible (zfs property). + */ + if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + return (B_FALSE); + + return (B_TRUE); +} + +static uint64_t +l2arc_write_size(l2arc_dev_t *dev) +{ + uint64_t size; + + size = dev->l2ad_write; + + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next, now; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); + + return (next); +} + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. 
+ */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *ab, *ab_prev; + l2arc_buf_hdr_t *abl2; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. + */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, ab); + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + + /* + * Allow ARC to begin reads to this L2ARC entry. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + kmem_cache_free(hdr_cache, head); + mutex_exit(&l2arc_buflist_mtx); + + l2arc_do_free_on_write(); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + int equal; + + ASSERT(zio->io_vd != NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + cb = zio->io_private; + ASSERT(cb != NULL); + buf = cb->l2rcb_buf; + ASSERT(buf != NULL); + + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + /* + * Check this survived the L2ARC journey. + */ + equal = arc_cksum_equal(buf); + if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = buf; + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. 
+ */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = EIO; + } + if (!equal) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. + */ + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, + buf->b_data, zio->io_size, arc_read_done, buf, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static list_t * +l2arc_list_locked(int list_num, kmutex_t **lock) +{ + list_t *list; + + ASSERT(list_num >= 0 && list_num <= 3); + + switch (list_num) { + case 0: + list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 1: + list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mru->arcs_mtx; + break; + case 2: + list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 3: + list = &arc_mru->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mru->arcs_mtx; + break; + } + + ASSERT(!(MUTEX_HELD(*lock))); + mutex_enter(*lock); + return (list); +} + +/* + * Evict buffers from the device write hand to the distance specified in + * bytes. This distance may span populated buffers, it may span nothing. + * This is clearing a region on the L2ARC device ready for writing. + * If the 'all' boolean is set, every buffer is evicted. + */ +static void +l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) +{ + list_t *buflist; + l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t taddr; + + buflist = dev->l2ad_buflist; + + if (buflist == NULL) + return; + + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + return; + } + + if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { + /* + * When nearing the end of the device, evict to the end + * before the device write hand jumps to the start. + */ + taddr = dev->l2ad_end; + } else { + taddr = dev->l2ad_hand + distance; + } + DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, + uint64_t, taddr, boolean_t, all); + +top: + mutex_enter(&l2arc_buflist_mtx); + for (ab = list_tail(buflist); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. Retry. + */ + ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); + mutex_exit(&l2arc_buflist_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + if (HDR_L2_WRITE_HEAD(ab)) { + /* + * We hit a write head node. Leave it for + * l2arc_write_done(). + */ + list_remove(buflist, ab); + mutex_exit(hash_lock); + continue; + } + + if (!all && ab->b_l2hdr != NULL && + (ab->b_l2hdr->b_daddr > taddr || + ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + /* + * We've evicted to the target address, + * or the end of the device. 
+ */ + mutex_exit(hash_lock); + break; + } + + if (HDR_FREE_IN_PROGRESS(ab)) { + /* + * Already on the path to destruction. + */ + mutex_exit(hash_lock); + continue; + } + + if (ab->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(ab)); + /* + * This doesn't exist in the ARC. Destroy. + * arc_hdr_destroy() will call list_remove() + * and decrement arcstat_l2_size. + */ + arc_change_state(arc_anon, ab, hash_lock); + arc_hdr_destroy(ab); + } else { + /* + * Invalidate issued or about to be issued + * reads, since we may be about to write + * over this location. + */ + if (HDR_L2_READING(ab)) { + ARCSTAT_BUMP(arcstat_l2_evict_reading); + ab->b_flags |= ARC_L2_EVICTED; + } + + /* + * Tell ARC this no longer exists in L2ARC. + */ + if (ab->b_l2hdr != NULL) { + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + list_remove(buflist, ab); + + /* + * This may have been leftover after a + * failed write. + */ + ab->b_flags &= ~ARC_L2_WRITING; + } + mutex_exit(hash_lock); + } + mutex_exit(&l2arc_buflist_mtx); + + vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); + dev->l2ad_evict = taddr; +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. + */ +static uint64_t +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +{ + arc_buf_hdr_t *ab, *ab_prev, *head; + l2arc_buf_hdr_t *hdrl2; + list_t *list; + uint64_t passed_sz, write_sz, buf_sz, headroom; + void *buf_data; + kmutex_t *hash_lock, *list_lock; + boolean_t have_lock, full; + l2arc_write_callback_t *cb; + zio_t *pio, *wzio; + uint64_t guid = spa_guid(spa); + + ASSERT(dev->l2ad_vdev != NULL); + + pio = NULL; + write_sz = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head->b_flags |= ARC_L2_WRITE_HEAD; + + /* + * Copy buffers for L2ARC writing. + */ + mutex_enter(&l2arc_buflist_mtx); + for (int try = 0; try <= 3; try++) { + list = l2arc_list_locked(try, &list_lock); + passed_sz = 0; + + /* + * L2ARC fast warmup. + * + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + headroom = target_sz * l2arc_headroom; + if (arc_warm == B_FALSE) + ab = list_head(list); + else + ab = list_tail(list); + + for (; ab; ab = ab_prev) { + if (arc_warm == B_FALSE) + ab_prev = list_next(list, ab); + else + ab_prev = list_prev(list, ab); + + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (!have_lock && !mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += ab->b_size; + if (passed_sz > headroom) { + /* + * Searched too far. + */ + mutex_exit(hash_lock); + break; + } + + if (!l2arc_write_eligible(guid, ab)) { + mutex_exit(hash_lock); + continue; + } + + if ((write_sz + ab->b_size) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(dev->l2ad_buflist, head); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + /* + * Create and add a new L2ARC header. 
+ */ + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2->b_dev = dev; + hdrl2->b_daddr = dev->l2ad_hand; + + ab->b_flags |= ARC_L2_WRITING; + ab->b_l2hdr = hdrl2; + list_insert_head(dev->l2ad_buflist, ab); + buf_data = ab->b_buf->b_data; + buf_sz = ab->b_size; + + /* + * Compute and store the buffer cksum before + * writing. On debug the cksum is verified first. + */ + arc_cksum_verify(ab->b_buf); + arc_cksum_compute(ab->b_buf, B_TRUE); + + mutex_exit(hash_lock); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, + NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + + /* + * Keep the clock hand suitably device-aligned. + */ + buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + write_sz += buf_sz; + dev->l2ad_hand += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + mutex_exit(&l2arc_buflist_mtx); + + if (pio == NULL) { + ASSERT3U(write_sz, ==, 0); + kmem_cache_free(hdr_cache, head); + return (0); + } + + ASSERT3U(write_sz, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); + ARCSTAT_INCR(arcstat_l2_size, write_sz); + vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. + */ + if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { + vdev_space_update(dev->l2ad_vdev, + dev->l2ad_end - dev->l2ad_hand, 0, 0); + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + dev->l2ad_writing = B_TRUE; + (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + return (write_sz); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +static void +l2arc_feed_thread(void) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size, wrote; + clock_t begin, next = ddi_get_lbolt(); + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + next); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = ddi_get_lbolt() + hz; + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + begin = ddi_get_lbolt(); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. + */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT(spa != NULL); + + /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* + * Avoid contributing to memory pressure. 
+ */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = l2arc_write_size(dev); + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. + */ + next = l2arc_write_interval(begin, size, wrote); + spa_config_exit(spa, SCL_L2ARC, dev); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev != NULL); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd) +{ + l2arc_dev_t *adddev; + + ASSERT(!l2arc_vdev_present(vd)); + + /* + * Create a new l2arc device entry. + */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_write = l2arc_write_max; + adddev->l2ad_boost = l2arc_write_boost; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; + ASSERT3U(adddev->l2ad_write, >, 0); + + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); + list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2node)); + + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT(remdev != NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. 
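+ *
+ * The l2arc_evict(remdev, 0, B_TRUE) call below asks for a full
+ * flush: instead of evicting ahead of a write target it walks the
+ * whole buflist and drops every remaining L2ARC reference, so the
+ * list can then be destroyed and the device entry freed.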
+ */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(remdev->l2ad_buflist); + kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + kmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + /* + * This is called from dmu_fini(), which is called from spa_fini(); + * Because of this, we can assume that all l2arc devices have + * already been removed when the pools themselves were removed. + */ + + l2arc_do_free_on_write(); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_buflist_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode_global & FWRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode_global & FWRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} diff --git a/uts/common/fs/zfs/bplist.c b/uts/common/fs/zfs/bplist.c new file mode 100644 index 000000000000..066ccc6b1e05 --- /dev/null +++ b/uts/common/fs/zfs/bplist.c @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/bplist.h> +#include <sys/zfs_context.h> + + +void +bplist_create(bplist_t *bpl) +{ + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&bpl->bpl_list, sizeof (bplist_entry_t), + offsetof(bplist_entry_t, bpe_node)); +} + +void +bplist_destroy(bplist_t *bpl) +{ + list_destroy(&bpl->bpl_list); + mutex_destroy(&bpl->bpl_lock); +} + +void +bplist_append(bplist_t *bpl, const blkptr_t *bp) +{ + bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); + + mutex_enter(&bpl->bpl_lock); + bpe->bpe_blk = *bp; + list_insert_tail(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while (bpe = list_head(&bpl->bpl_list)) { + list_remove(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); + func(arg, &bpe->bpe_blk, tx); + kmem_free(bpe, sizeof (*bpe)); + mutex_enter(&bpl->bpl_lock); + } + mutex_exit(&bpl->bpl_lock); +} diff --git a/uts/common/fs/zfs/bpobj.c b/uts/common/fs/zfs/bpobj.c new file mode 100644 index 000000000000..72be31235607 --- /dev/null +++ b/uts/common/fs/zfs/bpobj.c @@ -0,0 +1,495 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/bpobj.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +uint64_t +bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + int size; + + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) + size = BPOBJ_SIZE_V0; + else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + size = BPOBJ_SIZE_V1; + else + size = sizeof (bpobj_phys_t); + + return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, + DMU_OT_BPOBJ_HDR, size, tx)); +} + +void +bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + int64_t i; + bpobj_t bpo; + dmu_object_info_t doi; + int epb; + dmu_buf_t *dbuf = NULL; + + VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); + + mutex_enter(&bpo.bpo_lock); + + if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) + goto out; + + VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + VERIFY3U(0, ==, dmu_buf_hold(os, + bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + bpobj_free(os, objarray[blkoff], tx); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); + +out: + mutex_exit(&bpo.bpo_lock); + bpobj_close(&bpo); + + VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); +} + +int +bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(os, object, &doi); + if (err) + return (err); + + bzero(bpo, sizeof (*bpo)); + mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); + + ASSERT(bpo->bpo_dbuf == NULL); + ASSERT(bpo->bpo_phys == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + + err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + + bpo->bpo_os = os; + bpo->bpo_object = object; + bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; + bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); + bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_phys = bpo->bpo_dbuf->db_data; + return (0); +} + +void +bpobj_close(bpobj_t *bpo) +{ + /* Lame workaround for closing a bpobj that was never opened. 
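+ * bpobj_open() bzero()s the structure and then records a nonzero
+ * object number, and bpobj_close() clears it again on the way out,
+ * so bpo_object == 0 identifies a bpobj_t that was never
+ * successfully opened and has nothing to release.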
*/ + if (bpo->bpo_object == 0) + return; + + dmu_buf_rele(bpo->bpo_dbuf, bpo); + if (bpo->bpo_cached_dbuf != NULL) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + bpo->bpo_dbuf = NULL; + bpo->bpo_phys = NULL; + bpo->bpo_cached_dbuf = NULL; + bpo->bpo_object = 0; + + mutex_destroy(&bpo->bpo_lock); +} + +static int +bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, + boolean_t free) +{ + dmu_object_info_t doi; + int epb; + int64_t i; + int err = 0; + dmu_buf_t *dbuf = NULL; + + mutex_enter(&bpo->bpo_lock); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + blkptr_t *bparray; + blkptr_t *bp; + uint64_t offset, blkoff; + + offset = i * sizeof (blkptr_t); + blkoff = P2PHASE(i, bpo->bpo_epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, + FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + bparray = dbuf->db_data; + bp = &bparray[blkoff]; + err = func(arg, bp, tx); + if (err) + break; + if (free) { + bpo->bpo_phys->bpo_bytes -= + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); + } + bpo->bpo_phys->bpo_num_blkptrs--; + ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + i++; + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, + i * sizeof (blkptr_t), -1ULL, tx)); + } + if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) + goto out; + + ASSERT(bpo->bpo_havecomp); + err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); + if (err) { + mutex_exit(&bpo->bpo_lock); + return (err); + } + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + bpobj_t sublist; + uint64_t used_before, comp_before, uncomp_before; + uint64_t used_after, comp_after, uncomp_after; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); + if (err) + break; + if (free) { + err = bpobj_space(&sublist, + &used_before, &comp_before, &uncomp_before); + if (err) + break; + } + err = bpobj_iterate_impl(&sublist, func, arg, tx, free); + if (free) { + VERIFY3U(0, ==, bpobj_space(&sublist, + &used_after, &comp_after, &uncomp_after)); + bpo->bpo_phys->bpo_bytes -= used_before - used_after; + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + bpo->bpo_phys->bpo_comp -= comp_before - comp_after; + bpo->bpo_phys->bpo_uncomp -= + uncomp_before - uncomp_after; + } + + bpobj_close(&sublist); + if (err) + break; + if (free) { + err = dmu_object_free(bpo->bpo_os, + objarray[blkoff], tx); + if (err) + break; + bpo->bpo_phys->bpo_num_subobjs--; + ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if 
(free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + (i + 1) * sizeof (uint64_t), -1ULL, tx)); + } + +out: + /* If there are no entries, there should be no bytes. */ + ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || + (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || + bpo->bpo_phys->bpo_bytes == 0); + + mutex_exit(&bpo->bpo_lock); + return (err); +} + +/* + * Iterate and remove the entries. If func returns nonzero, iteration + * will stop and that entry will not be removed. + */ +int +bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); +} + +/* + * Iterate the entries. If func returns nonzero, iteration will stop. + */ +int +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); +} + +void +bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) +{ + bpobj_t subbpo; + uint64_t used, comp, uncomp, subsubobjs; + + ASSERT(bpo->bpo_havesubobj); + ASSERT(bpo->bpo_havecomp); + + VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); + + if (used == 0) { + /* No point in having an empty subobj. */ + bpobj_close(&subbpo); + bpobj_free(bpo->bpo_os, subobj, tx); + return; + } + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + } + + mutex_enter(&bpo->bpo_lock); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + + /* + * If subobj has only one block of subobjs, then move subobj's + * subobjs to bpo's subobj list directly. This reduces + * recursion in bpobj_iterate due to nested subobjs. + */ + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + if (subsubobjs != 0) { + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset == doi.doi_data_block_size) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, + subsubobjs, tx)); + } + } + bpo->bpo_phys->bpo_bytes += used; + bpo->bpo_phys->bpo_comp += comp; + bpo->bpo_phys->bpo_uncomp += uncomp; + mutex_exit(&bpo->bpo_lock); + + bpobj_close(&subbpo); +} + +void +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t stored_bp = *bp; + uint64_t offset; + int blkoff; + blkptr_t *bparray; + + ASSERT(!BP_IS_HOLE(bp)); + + /* We never need the fill count. 
*/ + stored_bp.blk_fill = 0; + + /* The bpobj will compress better if we can leave off the checksum */ + if (!BP_GET_DEDUP(bp)) + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + + mutex_enter(&bpo->bpo_lock); + + offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); + blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); + + if (bpo->bpo_cached_dbuf == NULL || + offset < bpo->bpo_cached_dbuf->db_offset || + offset >= bpo->bpo_cached_dbuf->db_offset + + bpo->bpo_cached_dbuf->db_size) { + if (bpo->bpo_cached_dbuf) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, bpo, &bpo->bpo_cached_dbuf, 0)); + } + + dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); + bparray = bpo->bpo_cached_dbuf->db_data; + bparray[blkoff] = stored_bp; + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + bpo->bpo_phys->bpo_num_blkptrs++; + bpo->bpo_phys->bpo_bytes += + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpo->bpo_lock); +} + +struct space_range_arg { + spa_t *spa; + uint64_t mintxg; + uint64_t maxtxg; + uint64_t used; + uint64_t comp; + uint64_t uncomp; +}; + +/* ARGSUSED */ +static int +space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct space_range_arg *sra = arg; + + if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + sra->used += bp_get_dsize_sync(sra->spa, bp); + sra->comp += BP_GET_PSIZE(bp); + sra->uncomp += BP_GET_UCSIZE(bp); + } + return (0); +} + +int +bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + mutex_enter(&bpo->bpo_lock); + + *usedp = bpo->bpo_phys->bpo_bytes; + if (bpo->bpo_havecomp) { + *compp = bpo->bpo_phys->bpo_comp; + *uncompp = bpo->bpo_phys->bpo_uncomp; + mutex_exit(&bpo->bpo_lock); + return (0); + } else { + mutex_exit(&bpo->bpo_lock); + return (bpobj_space_range(bpo, 0, UINT64_MAX, + usedp, compp, uncompp)); + } +} + +/* + * Return the amount of space in the bpobj which is: + * mintxg < blk_birth <= maxtxg + */ +int +bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + struct space_range_arg sra = { 0 }; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpo_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) + return (bpobj_space(bpo, usedp, compp, uncompp)); + + sra.spa = dmu_objset_spa(bpo->bpo_os); + sra.mintxg = mintxg; + sra.maxtxg = maxtxg; + + err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); + *usedp = sra.used; + *compp = sra.comp; + *uncompp = sra.uncomp; + return (err); +} diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c new file mode 100644 index 000000000000..9c4e0296db2b --- /dev/null +++ b/uts/common/fs/zfs/dbuf.c @@ -0,0 +1,2707 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> + +static void dbuf_destroy(dmu_buf_impl_t *db); +static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); + +/* + * Global data structures and functions for the dbuf cache. + */ +static kmem_cache_t *dbuf_cache; + +/* ARGSUSED */ +static int +dbuf_cons(void *vdb, void *unused, int kmflag) +{ + dmu_buf_impl_t *db = vdb; + bzero(db, sizeof (dmu_buf_impl_t)); + + mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + refcount_create(&db->db_holds); + return (0); +} + +/* ARGSUSED */ +static void +dbuf_dest(void *vdb, void *unused) +{ + dmu_buf_impl_t *db = vdb; + mutex_destroy(&db->db_mtx); + cv_destroy(&db->db_changed); + refcount_destroy(&db->db_holds); +} + +/* + * dbuf hash table routines + */ +static dbuf_hash_table_t dbuf_hash_table; + +static uint64_t dbuf_hash_count; + +static uint64_t +dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) +{ + uintptr_t osv = (uintptr_t)os; + uint64_t crc = -1ULL; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; + + crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); + + return (crc); +} + +#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); + +#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ + ((dbuf)->db.db_object == (obj) && \ + (dbuf)->db_objset == (os) && \ + (dbuf)->db_level == (level) && \ + (dbuf)->db_blkid == (blkid)) + +dmu_buf_impl_t * +dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_t *os = dn->dn_objset; + uint64_t obj = dn->dn_object; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *db; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { + if (DBUF_EQUAL(db, os, obj, level, blkid)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (db); + } + mutex_exit(&db->db_mtx); + } + } + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. 
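+ * (Callers are expected to treat a non-NULL return as having lost
+ * the insertion race and to use the existing dbuf in place of the
+ * one they were constructing.)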
+ * Otherwise returns NULL. + */ +static dmu_buf_impl_t * +dbuf_hash_insert(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_t *os = db->db_objset; + uint64_t obj = db->db.db_object; + int level = db->db_level; + uint64_t blkid = db->db_blkid; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { + if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { + mutex_enter(&dbf->db_mtx); + if (dbf->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (dbf); + } + mutex_exit(&dbf->db_mtx); + } + } + + mutex_enter(&db->db_mtx); + db->db_hash_next = h->hash_table[idx]; + h->hash_table[idx] = db; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, 1); + + return (NULL); +} + +/* + * Remove an entry from the hash table. This operation will + * fail if there are any existing holds on the db. + */ +static void +dbuf_hash_remove(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf, **dbp; + + /* + * We musn't hold db_mtx to maintin lock ordering: + * DBUF_HASH_MUTEX > db_mtx. + */ + ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(db->db_state == DB_EVICTING); + ASSERT(!MUTEX_HELD(&db->db_mtx)); + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + dbp = &h->hash_table[idx]; + while ((dbf = *dbp) != db) { + dbp = &dbf->db_hash_next; + ASSERT(dbf != NULL); + } + *dbp = db->db_hash_next; + db->db_hash_next = NULL; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, -1); +} + +static arc_evict_func_t dbuf_do_evict; + +static void +dbuf_evict_user(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level != 0 || db->db_evict_func == NULL) + return; + + if (db->db_user_data_ptr_ptr) + *db->db_user_data_ptr_ptr = db->db.db_data; + db->db_evict_func(&db->db, db->db_user_ptr); + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; +} + +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) +{ + if (db->db_level > 0) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + +void +dbuf_evict(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL); + ASSERT(db->db_data_pending == NULL); + + dbuf_clear(db); + dbuf_destroy(db); +} + +void +dbuf_init(void) +{ + uint64_t hsize = 1ULL << 16; + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 4K block size. The table will take up + * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
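+ *
+ * For example, with 16GB of physical memory the loop below grows
+ * hsize from 2^16 to 2^22 (4M) buckets, since 2^22 * 4096 == 16GB;
+ * with 8-byte pointers that is a 32MB table, matching the 2MB/GB
+ * figure above.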
+ */ + while (hsize * 4096 < physmem * PAGESIZE) + hsize <<= 1; + +retry: + h->hash_table_mask = hsize - 1; + h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); + if (h->hash_table == NULL) { + /* XXX - we should really return an error instead of assert */ + ASSERT(hsize > (1ULL << 10)); + hsize >>= 1; + goto retry; + } + + dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + sizeof (dmu_buf_impl_t), + 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); +} + +void +dbuf_fini(void) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); + kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); + kmem_cache_destroy(dbuf_cache); +} + +/* + * Other stuff. + */ + +#ifdef ZFS_DEBUG +static void +dbuf_verify(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dbuf_dirty_record_t *dr; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn == NULL) { + ASSERT(db->db_parent == NULL); + ASSERT(db->db_blkptr == NULL); + } else { + ASSERT3U(db->db.db_object, ==, dn->dn_object); + ASSERT3P(db->db_objset, ==, dn->dn_objset); + ASSERT3U(db->db_level, <, dn->dn_nlevels); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !list_is_empty(&dn->dn_dbufs)); + } + if (db->db_blkid == DMU_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); + } else if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, 0); + } else { + ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); + } + + for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + /* + * We can't assert that db_size matches dn_datablksz because it + * can be momentarily different when another thread is doing + * dnode_set_blksz(). + */ + if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { + dr = db->db_data_pending; + /* + * It should only be modified in syncing context, so + * make sure we only have one copy of the data. + */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); + } + + /* verify db->db_blkptr */ + if (db->db_blkptr) { + if (db->db_parent == dn->dn_dbuf) { + /* db is pointed to by the dnode */ + /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) + ASSERT(db->db_parent == NULL); + else + ASSERT(db->db_parent != NULL); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } else { + /* db is pointed to by an indirect block */ + int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; + ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); + ASSERT3U(db->db_parent->db.db_object, ==, + db->db.db_object); + /* + * dnode_grow_indblksz() can make this fail if we don't + * have the struct_rwlock. XXX indblksz no longer + * grows. safe to do this now? 
+ */ + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); + } + } + } + if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + (db->db_buf == NULL || db->db_buf->b_data) && + db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && + db->db_state != DB_FILL && !dn->dn_free_txg) { + /* + * If the blkptr isn't set but they have nonzero data, + * it had better be dirty, otherwise we'll lose that + * data when we evict this buffer. + */ + if (db->db_dirtycnt == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } + } + DB_DNODE_EXIT(db); +} +#endif + +static void +dbuf_update_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_level == 0 && db->db_user_data_ptr_ptr) { + ASSERT(!refcount_is_zero(&db->db_holds)); + *db->db_user_data_ptr_ptr = db->db.db_data; + } +} + +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + db->db_buf = buf; + if (buf != NULL) { + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + dbuf_update_data(db); + } else { + dbuf_evict_user(db); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; + } +} + +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + spa_t *spa; + + mutex_exit(&db->db_mtx); + DB_GET_SPA(&spa, db); + abuf = arc_loan_buf(spa, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + dbuf_set_data(db, NULL); + mutex_exit(&db->db_mtx); + } + return (abuf); +} + +uint64_t +dbuf_whichblock(dnode_t *dn, uint64_t offset) +{ + if (dn->dn_datablkshift) { + return (offset >> dn->dn_datablkshift); + } else { + ASSERT3U(offset, <, dn->dn_datablksz); + return (0); + } +} + +static void +dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + + mutex_enter(&db->db_mtx); + ASSERT3U(db->db_state, ==, DB_READ); + /* + * All reads are synchronous, so we must have a hold on the dbuf + */ + ASSERT(refcount_count(&db->db_holds) > 0); + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + if (db->db_level == 0 && db->db_freed_in_flight) { + /* we were freed in flight; disregard any error */ + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + arc_buf_freeze(buf); + db->db_freed_in_flight = FALSE; + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else if (zio == NULL || zio->io_error == 0) { + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + db->db_state = DB_UNCACHED; + } + cv_broadcast(&db->db_changed); + dbuf_rele_and_unlock(db, NULL); +} + +static void +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) +{ + dnode_t *dn; + spa_t *spa; + zbookmark_t zb; + uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(!refcount_is_zero(&db->db_holds)); + /* We need the struct_rwlock to prevent db_blkptr from changing. 
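+ * db_blkptr normally points either into dn_phys->dn_blkptr[] or
+ * into the parent indirect block's db_data (see dbuf_verify()
+ * above), so anything that reshapes the dnode's block tree could
+ * move it out from under us while this read is being set up.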
*/ + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_buf == NULL); + + if (db->db_blkid == DMU_BONUS_BLKID) { + int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); + + ASSERT3U(bonuslen, <=, db->db.db_size); + db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + if (bonuslen < DN_MAX_BONUSLEN) + bzero(db->db.db_data, DN_MAX_BONUSLEN); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + DB_DNODE_EXIT(db); + dbuf_update_data(db); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, + db->db.db_size, db, type)); + DB_DNODE_EXIT(db); + bzero(db->db.db_data, db->db.db_size); + db->db_state = DB_CACHED; + *flags |= DB_RF_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + spa = dn->dn_objset->os_spa; + DB_DNODE_EXIT(db); + + db->db_state = DB_READ; + mutex_exit(&db->db_mtx); + + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + + SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + dbuf_add_ref(db, NULL); + /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) dsl_read(zio, spa, db->db_blkptr, pbuf, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, + (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + &aflags, &zb); + if (aflags & ARC_CACHED) + *flags |= DB_RF_CACHED; +} + +int +dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +{ + int err = 0; + int havepzio = (zio != NULL); + int prefetch; + dnode_t *dn; + + /* + * We don't have to hold the mutex to check db_state because it + * can't be freed while we have a hold on the buffer. 
+ */ + ASSERT(!refcount_is_zero(&db->db_holds)); + + if (db->db_state == DB_NOFILL) + return (EIO); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && + DBUF_IS_CACHEABLE(db); + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_CACHED) { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + } else if (db->db_state == DB_UNCACHED) { + spa_t *spa = dn->dn_objset->os_spa; + + if (zio == NULL) + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + dbuf_read_impl(db, zio, &flags); + + /* dbuf_read_impl has dropped db_mtx for us */ + + if (prefetch) + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, + db->db.db_size, flags & DB_RF_CACHED); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + if (!havepzio) + err = zio_wait(zio); + } else { + mutex_exit(&db->db_mtx); + if (prefetch) + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, + db->db.db_size, TRUE); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + mutex_enter(&db->db_mtx); + if ((flags & DB_RF_NEVERWAIT) == 0) { + while (db->db_state == DB_READ || + db->db_state == DB_FILL) { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + cv_wait(&db->db_changed, &db->db_mtx); + } + if (db->db_state == DB_UNCACHED) + err = EIO; + } + mutex_exit(&db->db_mtx); + } + + ASSERT(err || havepzio || db->db_state == DB_CACHED); + return (err); +} + +static void +dbuf_noread(dmu_buf_impl_t *db) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + DB_GET_SPA(&spa, db); + dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + db->db_state = DB_FILL; + } else if (db->db_state == DB_NOFILL) { + dbuf_set_data(db, NULL); + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + mutex_exit(&db->db_mtx); +} + +/* + * This is our just-in-time copy function. It makes a copy of + * buffers, that have been modified in a previous transaction + * group, before we modify them in the current active group. + * + * This function is used in two places: when we are dirtying a + * buffer for the first time in a txg, and when we are freeing + * a range in a dnode that includes this buffer. + * + * Note that when we are called from dbuf_free_range() we do + * not put a hold on the buffer, we just traverse the active + * dbuf list for the dnode. + */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) + return; + + /* + * If the last dirty record for this dbuf has not yet synced + * and its referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there a no active holders) + * just null out the current db_data pointer. + */ + ASSERT(dr->dr_txg >= txg - 2); + if (db->db_blkid == DMU_BONUS_BLKID) { + /* Note that the data bufs here are zio_bufs */ + dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + int size = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; + + DB_GET_SPA(&spa, db); + dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + } else { + dbuf_set_data(db, NULL); + } +} + +void +dbuf_unoverride(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + uint64_t txg = dr->dr_txg; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); + ASSERT(db->db_level == 0); + + if (db->db_blkid == DMU_BONUS_BLKID || + dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) + return; + + ASSERT(db->db_data_pending != dr); + + /* free this block */ + if (!BP_IS_HOLE(bp)) { + spa_t *spa; + + DB_GET_SPA(&spa, db); + zio_free(spa, txg, bp); + } + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + /* + * Release the already-written buffer, so we leave it in + * a consistent dirty state. Note that all callers are + * modifying the buffer, so they will immediately do + * another (redundant) arc_release(). Therefore, leave + * the buf thawed to save the effort of freezing & + * immediately re-thawing it. + */ + arc_release(dr->dt.dl.dr_data, db); +} + +/* + * Evict (if its unreferenced) or clear (if its referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen accross any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. 
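+ *
+ * The level-1 range below is just the level-0 block ids shifted by
+ * epbs (block pointers per indirect block).  For example, assuming
+ * the default 16K indirect blocks (epbs = 14 - SPA_BLKPTRSHIFT = 7,
+ * i.e. 128 block pointers each), freeing level-0 blocks 1000
+ * through 5000 touches level-1 dbufs 1000 >> 7 = 7 through
+ * 5000 >> 7 = 39.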
+ */ +void +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; + + if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + + if (db->db_level != 0) + continue; + dprintf_dbuf(db, "found buf %s\n", ""); + if (db->db_blkid < start || db->db_blkid > end) + continue; + + /* found a level 0 buffer in the range */ + if (dbuf_undirty(db, tx)) + continue; + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL || + db->db_state == DB_EVICTING) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + continue; + } + if (db->db_state == DB_READ || db->db_state == DB_FILL) { + /* will be handled in dbuf_read_done or dbuf_rele */ + db->db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + continue; + } + if (refcount_count(&db->db_holds) == 0) { + ASSERT(db->db_buf); + dbuf_clear(db); + continue; + } + /* The dbuf is referenced */ + + if (db->db_last_dirty != NULL) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + if (dr->dr_txg == txg) { + /* + * This buffer is "in-use", re-adjust the file + * size to reflect that this buffer may + * contain new data when we sync. + */ + if (db->db_blkid != DMU_SPILL_BLKID && + db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + dbuf_unoverride(dr); + } else { + /* + * This dbuf is not dirty in the open context. + * Either uncache it (if its not referenced in + * the open context) or reset its contents to + * empty. + */ + dbuf_fix_old_data(db, txg); + } + } + /* clear the contents if its cached */ + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + arc_release(db->db_buf, db); + bzero(db->db.db_data, db->db.db_size); + arc_buf_freeze(db->db_buf); + } + + mutex_exit(&db->db_mtx); + } + mutex_exit(&dn->dn_dbufs_mtx); +} + +static int +dbuf_block_freeable(dmu_buf_impl_t *db) +{ + dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; + uint64_t birth_txg = 0; + + /* + * We don't need any locking to protect db_blkptr: + * If it's syncing, then db_last_dirty will be set + * so we'll ignore db_blkptr. + */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_last_dirty) + birth_txg = db->db_last_dirty->dr_txg; + else if (db->db_blkptr) + birth_txg = db->db_blkptr->blk_birth; + + /* + * If we don't exist or are in a snapshot, we can't be freed. + * Don't pass the bp to dsl_dataset_block_freeable() since we + * are holding the db_mtx lock and might deadlock if we are + * prefetching a dedup-ed block. 
+ */ + if (birth_txg) + return (ds == NULL || + dsl_dataset_block_freeable(ds, NULL, birth_txg)); + else + return (FALSE); +} + +void +dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) +{ + arc_buf_t *buf, *obuf; + int osize = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dnode_t *dn; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + /* XXX does *this* func really need the lock? */ + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * This call to dbuf_will_dirty() with the dn_struct_rwlock held + * is OK, because there can be no other references to the db + * when we are changing its size, so no concurrent DB_FILL can + * be happening. + */ + /* + * XXX we should be doing a dbuf_read, checking the return + * value and returning that up to our callers + */ + dbuf_will_dirty(db, tx); + + /* create the data buffer for the new block */ + buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + + /* copy old block data to the new block */ + obuf = db->db_buf; + bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + /* zero the remainder */ + if (size > osize) + bzero((uint8_t *)buf->b_data + osize, size - osize); + + mutex_enter(&db->db_mtx); + dbuf_set_data(db, buf); + VERIFY(arc_buf_remove_ref(obuf, db) == 1); + db->db.db_size = size; + + if (db->db_level == 0) { + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + db->db_last_dirty->dt.dl.dr_data = buf; + } + mutex_exit(&db->db_mtx); + + dnode_willuse_space(dn, size-osize, tx); + DB_DNODE_EXIT(db); +} + +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os; + zbookmark_t zb; + + DB_GET_OBJSET(&os, db); + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + zb.zb_objset = os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + (void) arc_release_bp(db->db_buf, db, + db->db_blkptr, os->os_spa, &zb); +} + +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + objset_t *os; + dbuf_dirty_record_t **drp, *dr; + int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + DMU_TX_DIRTY_BUF(tx, db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* + * Shouldn't dirty a regular buffer in syncing context. Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); + /* + * We make this assert for private objects as well, but after we + * check if we're already dirty. They are allowed to re-dirty + * in syncing context. + */ + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + mutex_enter(&db->db_mtx); + /* + * XXX make this true for indirects too? The problem is that + * transactions created with dmu_tx_create_assigned() from + * syncing context don't bother holding ahead. 
+ */ + ASSERT(db->db_level != 0 || + db->db_state == DB_CACHED || db->db_state == DB_FILL || + db->db_state == DB_NOFILL); + + mutex_enter(&dn->dn_mtx); + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED && + !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + dn->dn_dirtyctx = + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); + ASSERT(dn->dn_dirtyctx_firstset == NULL); + dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&dn->dn_mtx); + + if (db->db_blkid == DMU_SPILL_BLKID) + dn->dn_have_spill = B_TRUE; + + /* + * If this buffer is already dirty, we're done. + */ + drp = &db->db_last_dirty; + ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { + DB_DNODE_EXIT(db); + + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(dr); + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) + arc_buf_thaw(db->db_buf); + } + mutex_exit(&db->db_mtx); + return (dr); + } + + /* + * Only valid if not already dirty. + */ + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. + */ + os = dn->dn_objset; + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + if (db->db_blkid != DMU_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dsize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + + /* + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. + */ + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + if (db->db_level == 0) { + void *data_old = db->db_buf; + + if (db->db_state != DB_NOFILL) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so + * that we can modify it without impacting + * possible other users of this cached data + * block. 
Note that indirect blocks and + * private objects are not released until the + * syncing state (since they are only modified + * then). + */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); + } + dr->dt.dl.dr_data = data_old; + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + dr->dr_next = *drp; + *drp = dr; + + /* + * We could have been freed_in_flight between the dbuf_noread + * and dbuf_dirty. We win, as though the dbuf_noread() had + * happened after the free. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, 3); + + mutex_exit(&db->db_mtx); + + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dsize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. 
+ */ + ddt_prefetch(os->os_spa, bp); + dnode_willuse_space(dn, -willfree, tx); + } + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + + if (db->db_level+1 < dn->dn_nlevels) { + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + ASSERT(parent != NULL); + parent_held = TRUE; + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + ASSERT3U(db->db_level+1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + mutex_enter(&db->db_mtx); + /* possible race with dbuf_undirty() */ + if (db->db_last_dirty == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); + } else { + ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + } + + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); +} + +static int +dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + uint64_t txg = tx->tx_txg; + dbuf_dirty_record_t *dr, **drp; + + ASSERT(txg != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + mutex_enter(&db->db_mtx); + /* + * If this buffer is not dirty, we're done. + */ + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg <= txg) + break; + if (dr == NULL || dr->dr_txg < txg) { + mutex_exit(&db->db_mtx); + return (0); + } + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + /* + * If this buffer is currently held, we cannot undirty + * it, since one of the current holders may be in the + * middle of an update. Note that users of dbuf_undirty() + * should not place a hold on the dbuf before the call. 
+ */ + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + /* Make sure we don't toss this buffer at sync phase */ + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + DB_DNODE_EXIT(db); + return (0); + } + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + ASSERT(db->db.db_size != 0); + + /* XXX would be nice to fix up dn_towrite_space[] */ + + *drp = dr->dr_next; + + if (dr->dr_parent) { + mutex_enter(&dr->dr_parent->dt.di.dr_mtx); + list_remove(&dr->dr_parent->dt.di.dr_children, dr); + mutex_exit(&dr->dr_parent->dt.di.dr_mtx); + } else if (db->db_level+1 == dn->dn_nlevels) { + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); + mutex_exit(&dn->dn_mtx); + } + DB_DNODE_EXIT(db); + + if (db->db_level == 0) { + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); + + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + } + } else { + ASSERT(db->db_buf != NULL); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + + if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { + arc_buf_t *buf = db->db_buf; + + ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + return (1); + } + + mutex_exit(&db->db_mtx); + return (0); +} + +#pragma weak dmu_buf_will_dirty = dbuf_will_dirty +void +dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); + (void) dbuf_read(db, NULL, rf); + (void) dbuf_dirty(db, tx); +} + +void +dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_state = DB_NOFILL; + + dmu_buf_will_fill(db_fake, tx); +} + +void +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(tx->tx_txg != 0); + ASSERT(db->db_level == 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || + dmu_tx_private_ok(tx)); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); +} + +#pragma weak dmu_buf_fill_done = dbuf_fill_done +/* ARGSUSED */ +void +dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + + if (db->db_state == DB_FILL) { + if (db->db_level == 0 && db->db_freed_in_flight) { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + /* we were freed while filling */ + /* XXX dbuf_undirty? */ + bzero(db->db.db_data, db->db.db_size); + db->db_freed_in_flight = FALSE; + } + db->db_state = DB_CACHED; + cv_broadcast(&db->db_changed); + } + mutex_exit(&db->db_mtx); +} + +/* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
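+ *
+ * "Referenced by anybody except our caller" is judged below as
+ * refcount_count(&db->db_holds) - 1 > db->db_dirtycnt, i.e. holds
+ * beyond the caller's own and those accounted to dirty records;
+ * the copy and no-copy outcomes are tallied by
+ * xuio_stat_wbuf_copied() and xuio_stat_wbuf_nocopy().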
+ */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT(buf != NULL); + ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + xuio_stat_wbuf_copied(); + return; + } + + xuio_stat_wbuf_nocopy(); + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dbuf_fill_done(db, tx); +} + +/* + * "Clear" the contents of this dbuf. This will mark the dbuf + * EVICTING and clear *most* of its references. Unfortunetely, + * when we are not holding the dn_dbufs_mtx, we can't clear the + * entry in the dn_dbufs list. We have to wait until dbuf_destroy() + * in this case. For callers from the DMU we will usually see: + * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() + * For the arc callback, we will usually see: + * dbuf_do_evict()->dbuf_clear();dbuf_destroy() + * Sometimes, though, we will get a mix of these two: + * DMU: dbuf_clear()->arc_buf_evict() + * ARC: dbuf_do_evict()->dbuf_destroy() + */ +void +dbuf_clear(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dmu_buf_impl_t *parent = db->db_parent; + dmu_buf_impl_t *dndb; + int dbuf_gone = FALSE; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(refcount_is_zero(&db->db_holds)); + + dbuf_evict_user(db); + + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + if (db->db_blkid == DMU_BONUS_BLKID) { + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + } + db->db.db_data = NULL; + db->db_state = DB_UNCACHED; + } + + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); + ASSERT(db->db_data_pending == NULL); + + db->db_state = DB_EVICTING; + db->db_blkptr = NULL; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; + if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); + membar_producer(); + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. 
+ * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. + */ + dnode_rele(dn, db); + db->db_dnode_handle = NULL; + } else { + DB_DNODE_EXIT(db); + } + + if (db->db_buf) + dbuf_gone = arc_buf_evict(db->db_buf); + + if (!dbuf_gone) + mutex_exit(&db->db_mtx); + + /* + * If this dbuf is referenced from an indirect dbuf, + * decrement the ref count on the indirect dbuf. + */ + if (parent && parent != dndb) + dbuf_rele(parent, db); +} + +static int +dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, + dmu_buf_impl_t **parentp, blkptr_t **bpp) +{ + int nlevels, epbs; + + *parentp = NULL; + *bpp = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + + if (blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_have_spill && + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + *bpp = &dn->dn_phys->dn_spill; + else + *bpp = NULL; + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + mutex_exit(&dn->dn_mtx); + return (0); + } + + if (dn->dn_phys->dn_nlevels == 0) + nlevels = 1; + else + nlevels = dn->dn_phys->dn_nlevels; + + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT3U(level * epbs, <, 64); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (level >= nlevels || + (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { + /* the buffer has no parent yet */ + return (ENOENT); + } else if (level < nlevels-1) { + /* this block is referenced from an indirect block */ + int err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, NULL, parentp); + if (err) + return (err); + err = dbuf_read(*parentp, NULL, + (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err) { + dbuf_rele(*parentp, NULL); + *parentp = NULL; + return (err); + } + *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + (blkid & ((1ULL << epbs) - 1)); + return (0); + } else { + /* the block is referenced from the dnode */ + ASSERT3U(level, ==, nlevels-1); + ASSERT(dn->dn_phys->dn_nblkptr == 0 || + blkid < dn->dn_phys->dn_nblkptr); + if (dn->dn_dbuf) { + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + } + *bpp = &dn->dn_phys->dn_blkptr[blkid]; + return (0); + } +} + +static dmu_buf_impl_t * +dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, + dmu_buf_impl_t *parent, blkptr_t *blkptr) +{ + objset_t *os = dn->dn_objset; + dmu_buf_impl_t *db, *odb; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(dn->dn_type != DMU_OT_NONE); + + db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + + db->db_objset = os; + db->db.db_object = dn->dn_object; + db->db_level = level; + db->db_blkid = blkid; + db->db_last_dirty = NULL; + db->db_dirtycnt = 0; + db->db_dnode_handle = dn->dn_handle; + db->db_parent = parent; + db->db_blkptr = blkptr; + + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; + db->db_immediate_evict = 0; + db->db_freed_in_flight = 0; + + if (blkid == DMU_BONUS_BLKID) { + ASSERT3P(parent, ==, dn->dn_dbuf); + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + db->db.db_offset = DMU_BONUS_BLKID; + db->db_state = DB_UNCACHED; + /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + return (db); + } else if (blkid == DMU_SPILL_BLKID) { + db->db.db_size = (blkptr != NULL) ? + BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; + db->db.db_offset = 0; + } else { + int blocksize = + db->db_level ? 
1<<dn->dn_indblkshift : dn->dn_datablksz; + db->db.db_size = blocksize; + db->db.db_offset = db->db_blkid * blocksize; + } + + /* + * Hold the dn_dbufs_mtx while we get the new dbuf + * in the hash table *and* added to the dbufs list. + * This prevents a possible deadlock with someone + * trying to look up this dbuf before its added to the + * dn_dbufs list. + */ + mutex_enter(&dn->dn_dbufs_mtx); + db->db_state = DB_EVICTING; + if ((odb = dbuf_hash_insert(db)) != NULL) { + /* someone else inserted it first */ + kmem_cache_free(dbuf_cache, db); + mutex_exit(&dn->dn_dbufs_mtx); + return (odb); + } + list_insert_head(&dn->dn_dbufs, db); + db->db_state = DB_UNCACHED; + mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + + if (parent && parent != dn->dn_dbuf) + dbuf_add_ref(parent, db); + + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + refcount_count(&dn->dn_holds) > 0); + (void) refcount_add(&dn->dn_holds, db); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); + + dprintf_dbuf(db, "db=%p\n", db); + + return (db); +} + +static int +dbuf_do_evict(void *private) +{ + arc_buf_t *buf = private; + dmu_buf_impl_t *db = buf->b_private; + + if (!MUTEX_HELD(&db->db_mtx)) + mutex_enter(&db->db_mtx); + + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_state != DB_EVICTING) { + ASSERT(db->db_state == DB_CACHED); + DBUF_VERIFY(db); + db->db_buf = NULL; + dbuf_evict(db); + } else { + mutex_exit(&db->db_mtx); + dbuf_destroy(db); + } + return (0); +} + +static void +dbuf_destroy(dmu_buf_impl_t *db) +{ + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this dbuf is still on the dn_dbufs list, + * remove it from that list. + */ + if (db->db_dnode_handle != NULL) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + mutex_enter(&dn->dn_dbufs_mtx); + list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); + mutex_exit(&dn->dn_dbufs_mtx); + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold + * corresponding to the removed dbuf is no longer + * discounted in dnode_move(), so the dnode cannot be + * moved until after we release the hold. + */ + dnode_rele(dn, db); + db->db_dnode_handle = NULL; + } + dbuf_hash_remove(db); + } + db->db_parent = NULL; + db->db_buf = NULL; + + ASSERT(!list_link_active(&db->db_link)); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + + kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); +} + +void +dbuf_prefetch(dnode_t *dn, uint64_t blkid) +{ + dmu_buf_impl_t *db = NULL; + blkptr_t *bp = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (dnode_block_freed(dn, blkid)) + return; + + /* dbuf_find() returns with db_mtx held */ + if (db = dbuf_find(dn, 0, blkid)) { + /* + * This dbuf is already in the cache. We assume that + * it is already CACHED, or else about to be either + * read or filled. + */ + mutex_exit(&db->db_mtx); + return; + } + + if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { + if (bp && !BP_IS_HOLE(bp)) { + int priority = dn->dn_type == DMU_OT_DDT_ZAP ? + ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; + arc_buf_t *pbuf; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, + dn->dn_object, 0, blkid); + + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) dsl_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, priority, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + if (db) + dbuf_rele(db, NULL); + } +} + +/* + * Returns with db_holds incremented, and db_mtx not held. + * Note: dn_struct_rwlock must be held. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, + void *tag, dmu_buf_impl_t **dbp) +{ + dmu_buf_impl_t *db, *parent = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); + + *dbp = NULL; +top: + /* dbuf_find() returns with db_mtx held */ + db = dbuf_find(dn, level, blkid); + + if (db == NULL) { + blkptr_t *bp = NULL; + int err; + + ASSERT3P(parent, ==, NULL); + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = ENOENT; + if (err) { + if (parent) + dbuf_rele(parent, NULL); + return (err); + } + } + if (err && err != ENOENT) + return (err); + db = dbuf_create(dn, level, blkid, parent, bp); + } + + if (db->db_buf && refcount_is_zero(&db->db_holds)) { + arc_buf_add_ref(db->db_buf, db); + if (db->db_buf->b_data == NULL) { + dbuf_clear(db); + if (parent) { + dbuf_rele(parent, NULL); + parent = NULL; + } + goto top; + } + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + } + + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); + + /* + * If this buffer is currently syncing out, and we are are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + dn->dn_object != DMU_META_DNODE_OBJECT && + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + + if (dr->dt.dl.dr_data == db->db_buf) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + + dbuf_set_data(db, + arc_buf_alloc(dn->dn_objset->os_spa, + db->db.db_size, db, type)); + bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, + db->db.db_size); + } + } + + (void) refcount_add(&db->db_holds, tag); + dbuf_update_data(db); + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (parent) + dbuf_rele(parent, NULL); + + ASSERT3P(DB_DNODE(db), ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; + + return (0); +} + +dmu_buf_impl_t * +dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); + return (err ? NULL : db); +} + +dmu_buf_impl_t * +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + return (err ? 
NULL : db); +} + +void +dbuf_create_bonus(dnode_t *dn) +{ + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + ASSERT(dn->dn_bonus == NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); +} + +int +dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + if (db->db_blkid != DMU_SPILL_BLKID) + return (ENOTSUP); + if (blksz == 0) + blksz = SPA_MINBLOCKSIZE; + if (blksz > SPA_MAXBLOCKSIZE) + blksz = SPA_MAXBLOCKSIZE; + else + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dbuf_new_size(db, blksz, tx); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + return (0); +} + +void +dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); +} + +#pragma weak dmu_buf_add_ref = dbuf_add_ref +void +dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds = refcount_add(&db->db_holds, tag); + ASSERT(holds > 1); +} + +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) + * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ +#pragma weak dmu_buf_rele = dbuf_rele +void +dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, tag); +} + +/* + * dbuf_rele() for an already-locked dbuf. This is necessary to allow + * db_dirtycnt and db_holds to be updated atomically. + */ +void +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + DBUF_VERIFY(db); + + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ + holds = refcount_remove(&db->db_holds, tag); + ASSERT(holds >= 0); + + /* + * We can't freeze indirects if there is a possibility that they + * may be modified in the current syncing context. + */ + if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + arc_buf_freeze(db->db_buf); + + if (holds == db->db_dirtycnt && + db->db_level == 0 && db->db_immediate_evict) + dbuf_evict_user(db); + + if (holds == 0) { + if (db->db_blkid == DMU_BONUS_BLKID) { + mutex_exit(&db->db_mtx); + + /* + * If the dnode moves here, we cannot cross this barrier + * until the move completes. + */ + DB_DNODE_ENTER(db); + (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); + DB_DNODE_EXIT(db); + /* + * The bonus buffer's dnode hold is no longer discounted + * in dnode_move(). The dnode cannot move until after + * the dnode_rele(). + */ + dnode_rele(DB_DNODE(db), db); + } else if (db->db_buf == NULL) { + /* + * This is a special case: we never associated this + * dbuf with any data allocated from the ARC. + */ + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); + dbuf_evict(db); + } else if (arc_released(db->db_buf)) { + arc_buf_t *buf = db->db_buf; + /* + * This dbuf has anonymous data associated with it. 
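+ * ("Anonymous" here appears to mean an arc buf that has already been
+ * arc_release()d -- for instance one handed to us through
+ * dbuf_assign_arcbuf() -- so on the last hold we simply drop our
+ * reference and evict the dbuf rather than leaving the data to the
+ * ARC.)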
+ */ + dbuf_set_data(db, NULL); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + dbuf_evict(db); + } else { + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); + } + } else { + mutex_exit(&db->db_mtx); + } +} + +#pragma weak dmu_buf_refcount = dbuf_refcount +uint64_t +dbuf_refcount(dmu_buf_impl_t *db) +{ + return (refcount_count(&db->db_holds)); +} + +void * +dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + ASSERT((user_ptr == NULL) == (evict_func == NULL)); + + mutex_enter(&db->db_mtx); + + if (db->db_user_ptr == old_user_ptr) { + db->db_user_ptr = user_ptr; + db->db_user_data_ptr_ptr = user_data_ptr_ptr; + db->db_evict_func = evict_func; + + dbuf_update_data(db); + } else { + old_user_ptr = db->db_user_ptr; + } + + mutex_exit(&db->db_mtx); + return (old_user_ptr); +} + +void * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); + + return (db->db_user_ptr); +} + +boolean_t +dmu_buf_freeable(dmu_buf_t *dbuf) +{ + boolean_t res = B_FALSE; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + + if (db->db_blkptr) + res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, + db->db_blkptr, db->db_blkptr->blk_birth); + + return (res); +} + +static void +dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) +{ + /* ASSERT(dmu_tx_is_syncing(tx) */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_blkptr != NULL) + return; + + if (db->db_blkid == DMU_SPILL_BLKID) { + db->db_blkptr = &dn->dn_phys->dn_spill; + BP_ZERO(db->db_blkptr); + return; + } + if (db->db_level == dn->dn_phys->dn_nlevels-1) { + /* + * This buffer was allocated at a time when there was + * no available blkptrs from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mis-match). 
+ */ + ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + DBUF_VERIFY(db); + } else { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_hold_impl(dn, db->db_level+1, + db->db_blkid >> epbs, FALSE, db, &parent); + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + db->db_parent = parent; + } + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + DBUF_VERIFY(db); + } +} + +static void +dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + zio_t *zio; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + + ASSERT(db->db_level > 0); + DBUF_VERIFY(db); + + if (db->db_buf == NULL) { + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + mutex_enter(&db->db_mtx); + } + ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT(db->db_buf != NULL); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + dbuf_check_blkptr(dn, db); + DB_DNODE_EXIT(db); + + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + dbuf_write(dr, db->db_buf, tx); + + zio = dr->dr_zio; + mutex_enter(&dr->dt.di.dr_mtx); + dbuf_sync_list(&dr->dt.di.dr_children, tx); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_exit(&dr->dt.di.dr_mtx); + zio_nowait(zio); +} + +static void +dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + arc_buf_t **datap = &dr->dt.dl.dr_data; + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + uint64_t txg = tx->tx_txg; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + /* + * To be synced, we must be dirtied. But we + * might have been freed after the dirty. + */ + if (db->db_state == DB_UNCACHED) { + /* This buffer has been freed since it was dirtied */ + ASSERT(db->db.db_data == NULL); + } else if (db->db_state == DB_FILL) { + /* This buffer was freed and is now being re-filled */ + ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else { + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); + } + DBUF_VERIFY(db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + + /* + * If this is a bonus buffer, simply copy the bonus data into the + * dnode. It will be written out when the dnode is synced (and it + * will be synced, since it must have been dirty for dbuf_sync to + * be called). 
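+ *
+ * Note that no zio is issued for the bonus buffer: the dirty record
+ * is unlinked and freed right here and the txg hold is dropped via
+ * dbuf_rele_and_unlock(), instead of going through dbuf_write() the
+ * way ordinary level-0 buffers below do.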
+ */ + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_dirty_record_t **drp; + + ASSERT(*datap != NULL); + ASSERT3U(db->db_level, ==, 0); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); + bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + + if (*datap != db->db.db_data) { + zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + } + db->db_data_pending = NULL; + drp = &db->db_last_dirty; + while (*drp != dr) + drp = &(*drp)->dr_next; + ASSERT(dr->dr_next == NULL); + ASSERT(dr->dr_dbuf == db); + *drp = dr->dr_next; + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + return; + } + + os = dn->dn_objset; + + /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* + * If this buffer is in the middle of an immediate write, + * wait for the synchronous IO to complete. + */ + while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + cv_wait(&db->db_changed, &db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); + } + + if (db->db_state != DB_NOFILL && + dn->dn_object != DMU_META_DNODE_OBJECT && + refcount_count(&db->db_holds) > 1 && + dr->dt.dl.dr_override_state != DR_OVERRIDDEN && + *datap == db->db_buf) { + /* + * If this buffer is currently "in use" (i.e., there + * are active holds and db_data still references it), + * then make a copy before we start the write so that + * any modifications from the open txg will not leak + * into this write. + * + * NOTE: this copy does not need to be made for + * objects only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ + int blksz = arc_buf_size(*datap); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); + } + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + + dbuf_write(dr, *datap, tx); + + ASSERT(!list_link_active(&dr->dr_dirty_node)); + if (dn->dn_object == DMU_META_DNODE_OBJECT) { + list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); + DB_DNODE_EXIT(db); + } else { + /* + * Although zio_nowait() does not "wait for an IO", it does + * initiate the IO. If this is an empty write it seems plausible + * that the IO could actually be completed before the nowait + * returns. We need to DB_DNODE_EXIT() first in case + * zio_nowait() invalidates the dbuf. + */ + DB_DNODE_EXIT(db); + zio_nowait(dr->dr_zio); + } +} + +void +dbuf_sync_list(list_t *list, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + + while (dr = list_head(list)) { + if (dr->dr_zio != NULL) { + /* + * If we find an already initialized zio then we + * are processing the meta-dnode, and we have finished. + * The dbufs for all dnodes are put back on the list + * during processing, so that we can zio_wait() + * these IOs after initiating all child IOs. 
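+ *
+ * For ordinary objects the recursion below looks roughly like:
+ *	dbuf_sync_list()->dbuf_sync_indirect()->dbuf_sync_list()->
+ *	    ... ->dbuf_sync_leaf()
+ * with each indirect issuing its own write and then walking its
+ * dr_children (see dbuf_sync_indirect() above).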
+ */ + ASSERT3U(dr->dr_dbuf->db.db_object, ==, + DMU_META_DNODE_OBJECT); + break; + } + list_remove(list, dr); + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } +} + +/* ARGSUSED */ +static void +dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + spa_t *spa = zio->io_spa; + int64_t delta; + uint64_t fill = 0; + int i; + + ASSERT(db->db_blkptr == bp); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta - zio->io_prev_space_delta); + zio->io_prev_space_delta = delta; + + if (BP_IS_HOLE(bp)) { + ASSERT(bp->blk_fill == 0); + DB_DNODE_EXIT(db); + return; + } + + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype)); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + + mutex_enter(&db->db_mtx); + +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + } +#endif + + if (db->db_level == 0) { + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > dn->dn_phys->dn_maxblkid && + db->db_blkid != DMU_SPILL_BLKID) + dn->dn_phys->dn_maxblkid = db->db_blkid; + mutex_exit(&dn->dn_mtx); + + if (dn->dn_type == DMU_OT_DNODE) { + dnode_phys_t *dnp = db->db.db_data; + for (i = db->db.db_size >> DNODE_SHIFT; i > 0; + i--, dnp++) { + if (dnp->dn_type != DMU_OT_NONE) + fill++; + } + } else { + fill = 1; + } + } else { + blkptr_t *ibp = db->db.db_data; + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) + continue; + fill += ibp->blk_fill; + } + } + DB_DNODE_EXIT(db); + + bp->blk_fill = fill; + + mutex_exit(&db->db_mtx); +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint64_t txg = zio->io_txg; + dbuf_dirty_record_t **drp, *dr; + + ASSERT3U(zio->io_error, ==, 0); + ASSERT(db->db_blkptr == bp); + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + objset_t *os; + dsl_dataset_t *ds; + dmu_tx_t *tx; + + DB_GET_OBJSET(&os, db); + ds = os->os_dsl_dataset; + tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } + + mutex_enter(&db->db_mtx); + + DBUF_VERIFY(db); + + drp = &db->db_last_dirty; + while ((dr = *drp) != db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; + +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + DB_DNODE_EXIT(db); + } +#endif + + if (db->db_level == 0) { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + if (db->db_state != DB_NOFILL) { + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, + db) == 1); + else 
if (!arc_released(db->db_buf)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } + } else { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + if (!BP_IS_HOLE(db->db_blkptr)) { + int epbs = + dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + db->db.db_size); + ASSERT3U(dn->dn_phys->dn_maxblkid + >> (db->db_level * epbs), >=, db->db_blkid); + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } + DB_DNODE_EXIT(db); + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + db->db_data_pending = NULL; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); +} + +static void +dbuf_write_nofill_ready(zio_t *zio) +{ + dbuf_write_ready(zio, NULL, zio->io_private); +} + +static void +dbuf_write_nofill_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +static void +dbuf_write_override_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + + dbuf_write_ready(zio, NULL, db); +} + +static void +dbuf_write_override_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *obp = &dr->dt.dl.dr_overridden_by; + + mutex_enter(&db->db_mtx); + if (!BP_EQUAL(zio->io_bp, obp)) { + if (!BP_IS_HOLE(obp)) + dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); + arc_release(dr->dt.dl.dr_data, db); + } + mutex_exit(&db->db_mtx); + + dbuf_write_done(zio, NULL, db); +} + +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + zio_prop_t zp; + zio_t *zio; + int wp_flag = 0; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; + + if (db->db_state != DB_NOFILL) { + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) { + arc_buf_thaw(data); + } else { + dbuf_release_bp(db); + } + } + } + + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid != DMU_SPILL_BLKID) || + (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + if (db->db_blkid == DMU_SPILL_BLKID) + wp_flag = WP_SPILL; + wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; + + dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); + + if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + ASSERT(db->db_state != DB_NOFILL); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, data->b_data, arc_buf_size(data), &zp, + dbuf_write_override_ready, dbuf_write_override_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + mutex_enter(&db->db_mtx); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies); + mutex_exit(&db->db_mtx); + } else if (db->db_state == DB_NOFILL) { + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, NULL, db->db.db_size, &zp, + dbuf_write_nofill_ready, dbuf_write_nofill_done, db, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); + } else { + ASSERT(arc_released(data)); + dr->dr_zio = arc_write(zio, os->os_spa, txg, + db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, + dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } +} diff --git a/uts/common/fs/zfs/ddt.c b/uts/common/fs/zfs/ddt.c new file mode 100644 index 000000000000..718331496765 --- /dev/null +++ b/uts/common/fs/zfs/ddt.c @@ -0,0 +1,1146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/dsl_pool.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dsl_scan.h> + +/* + * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
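+ *
+ * This is an on-by-default global; like other zfs module variables it
+ * can presumably be overridden at boot time via /etc/system, e.g.:
+ *
+ *	set zfs:zfs_dedup_prefetch = 0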
+ */ +int zfs_dedup_prefetch = 1; + +static const ddt_ops_t *ddt_ops[DDT_TYPES] = { + &ddt_zap_ops, +}; + +static const char *ddt_class_name[DDT_CLASSES] = { + "ditto", + "duplicate", + "unique", +}; + +static void +ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp == 0); + VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); + ASSERT(*objectp != 0); + + VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx) == 0); + + VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static void +ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp != 0); + ASSERT(ddt_object_count(ddt, type, class) == 0); + ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); + VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); + VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); + VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); + + *objectp = 0; +} + +static int +ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + int error; + + ddt_object_name(ddt, type, class, name); + + error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); + + if (error) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class]); + + /* + * Seed the cached statistics. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ASSERT(error == 0); + return (error); +} + +static void +ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); + + /* + * Cache DDT statistics; this is the only time they'll change. 
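+ * The cached ddo_count/ddo_dspace/ddo_mspace values are what
+ * ddt_get_dedup_object_stats() sums up later, apparently so that
+ * readers of the stats never have to touch the on-disk objects.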
+ */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; +} + +static int +ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); +} + +static void +ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, + ddt->ddt_object[type][class], dde); +} + +int +ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +static int +ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +int +ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); +} + +uint64_t +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class])); +} + +int +ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); +} + +boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + +void +ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) +{ + (void) sprintf(name, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); +} + +void +ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +{ + ASSERT(txg != 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); +} + +void +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +{ + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, DMU_OT_DEDUP); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +void +ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) +{ + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); +} + +void +ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +{ + ASSERT(ddp->ddp_phys_birth == 0); + + for (int d = 0; d < 
SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); +} + +void +ddt_phys_clear(ddt_phys_t *ddp) +{ + bzero(ddp, sizeof (*ddp)); +} + +void +ddt_phys_addref(ddt_phys_t *ddp) +{ + ddp->ddp_refcnt++; +} + +void +ddt_phys_decref(ddt_phys_t *ddp) +{ + ASSERT((int64_t)ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; +} + +void +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +{ + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); +} + +ddt_phys_t * +ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +{ + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_entry_t *dde) +{ + uint64_t refcnt = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); +} + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + bzero(dds, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; +} + +static void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + bzero(dds, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). 
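+	 * The per-(checksum, type, class) counts and sizes are simply
+	 * added up; the loop below is followed by a step that divides
+	 * the two space totals by ddo_count, so the returned dspace and
+	 * mspace figures end up as rough per-entry averages.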
*/ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. */ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + return (dds_total.dds_ref_dsize - dds_total.dds_dsize); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} + +int +ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) +{ + spa_t *spa = ddt->ddt_spa; + uint64_t total_refcnt = 0; + uint64_t ditto = spa->spa_dedup_ditto; + int total_copies = 0; + int desired_copies = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *zio = dde->dde_lead_zio[p]; + uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ + if (zio != NULL) + refcnt += zio->io_parent_count; /* pending refs */ + if (ddp == ddp_willref) + refcnt++; /* caller's ref */ + if (refcnt != 0) { + total_refcnt += refcnt; + total_copies += p; + } + } + + if (ditto == 0 || ditto > UINT32_MAX) + ditto = UINT32_MAX; + + if (total_refcnt >= 1) + desired_copies++; + if (total_refcnt >= ditto) + desired_copies++; + if (total_refcnt >= ditto * ditto) + desired_copies++; + + return (MAX(desired_copies, total_copies) - total_copies); +} + +int +ddt_ditto_copies_present(ddt_entry_t *dde) +{ + ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; + dva_t *dva = ddp->ddp_dva; + int copies = 0 - DVA_GET_GANG(dva); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + if (DVA_IS_VALID(dva)) + copies++; + + ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); + + return (copies); +} + +size_t +ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + + return (c_len + 1); +} + +void 
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + byteswap_uint64_array(dst, d_len); +} + +ddt_t * +ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) +{ + return (spa->spa_ddt[c]); +} + +ddt_t * +ddt_select(spa_t *spa, const blkptr_t *bp) +{ + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); +} + +void +ddt_enter(ddt_t *ddt) +{ + mutex_enter(&ddt->ddt_lock); +} + +void +ddt_exit(ddt_t *ddt) +{ + mutex_exit(&ddt->ddt_lock); +} + +static ddt_entry_t * +ddt_alloc(const ddt_key_t *ddk) +{ + ddt_entry_t *dde; + + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); +} + +static void +ddt_free(ddt_entry_t *dde) +{ + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_data != NULL) + zio_buf_free(dde->dde_repair_data, + DDK_GET_PSIZE(&dde->dde_key)); + + cv_destroy(&dde->dde_cv); + kmem_free(dde, sizeof (*dde)); +} + +void +ddt_remove(ddt_t *ddt, ddt_entry_t *dde) +{ + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); +} + +ddt_entry_t * +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +{ + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return (NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) + break; + } + if (error != ENOENT) + break; + } + + ASSERT(error == 0 || error == ENOENT); + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); +} + +void +ddt_prefetch(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) + return; + + /* + * We only remove the DDT once all tables are empty and only + * prefetch dedup blocks when there are entries in the DDT. + * Thus no locking is required as the DDT can't disappear on us. 
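+ *
+ * The caller has no idea which (type, class) object the entry lives
+ * in, so we simply probe them all; ddt_object_prefetch() is a no-op
+ * for objects that don't exist, so the fan-out below is cheap.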
+ */ + ddt = ddt_select(spa, bp); + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &dde); + } + } +} + +int +ddt_entry_compare(const void *x1, const void *x2) +{ + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; + const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; + + for (int i = 0; i < DDT_KEY_WORDS; i++) { + if (u1[i] < u2[i]) + return (-1); + if (u1[i] > u2[i]) + return (1); + } + + return (0); +} + +static ddt_t * +ddt_table_alloc(spa_t *spa, enum zio_checksum c) +{ + ddt_t *ddt; + + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); +} + +static void +ddt_table_free(ddt_t *ddt) +{ + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_free(ddt, sizeof (*ddt)); +} + +void +ddt_create(spa_t *spa) +{ + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); +} + +int +ddt_load(spa_t *spa) +{ + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + + /* + * Seed the cached histograms. + */ + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + } + + return (0); +} + +void +ddt_unload(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } +} + +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +ddt_entry_t * +ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) +{ + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. 
For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. + */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); +} + +void +ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) +{ + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); +} + +static void +ddt_repair_entry_done(zio_t *zio) +{ + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); +} + +static void +ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) +{ + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); +} + +static void +ddt_repair_table(ddt_t *ddt, zio_t *rio) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); +} + +static void +ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +{ + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT((int64_t)ddp->ddp_refcnt >= 0); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) + nclass = DDT_CLASS_DITTO; + else if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + 
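/*
+		 * Pairs with ddt_lookup(): if the entry was found on disk,
+		 * ddt_stat_update(ddt, dde, -1ULL) subtracted it from the
+		 * histogram of its old (type, class) when it was loaded;
+		 * here it is added back under the new ones before the
+		 * object is updated.
+		 */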
ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) + */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } + } +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object, tx) == 0); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + uint64_t count = 0; + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + count += ddt_object_count(ddt, type, class); + } + } + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (count == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + } + + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); +} + +void +ddt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + zio_t *rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + + dmu_tx_commit(tx); +} + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (ENOENT); +} diff --git a/uts/common/fs/zfs/ddt_zap.c b/uts/common/fs/zfs/ddt_zap.c new file mode 100644 index 000000000000..d6a991c7c19e --- /dev/null +++ b/uts/common/fs/zfs/ddt_zap.c @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
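ddt_walk at the end of ddt.c above resumes from a saved bookmark and exhausts the innermost dimension before advancing the next one, so a scan can be suspended and restarted at any point. A standalone sketch of that resumable nested iteration, with assumed names and made-up table sizes:

/*
 * Sketch of bookmark-style iteration (assumed names and bounds, not the
 * imported source).  Each (class, type, checksum) slot is pretended to
 * hold SK_SLOT_ENTRIES entries; the bookmark remembers where to resume.
 */
#include <stdio.h>

#define	SK_N_CHECKSUM	3
#define	SK_N_TYPE	2
#define	SK_N_CLASS	3
#define	SK_SLOT_ENTRIES	2	/* pretend every table holds two entries */

struct sk_bookmark {
	int bm_class;
	int bm_type;
	int bm_checksum;
	int bm_cursor;
};

/* Fill *out with the next entry index and return 0, or -1 when exhausted. */
static int
sk_walk(struct sk_bookmark *bm, int *out)
{
	if (bm->bm_class >= SK_N_CLASS)
		return (-1);

	do {
		do {
			do {
				if (bm->bm_cursor < SK_SLOT_ENTRIES) {
					*out = ((bm->bm_class * SK_N_TYPE +
					    bm->bm_type) * SK_N_CHECKSUM +
					    bm->bm_checksum) *
					    SK_SLOT_ENTRIES + bm->bm_cursor;
					bm->bm_cursor++;
					return (0);
				}
				bm->bm_cursor = 0;
			} while (++bm->bm_checksum < SK_N_CHECKSUM);
			bm->bm_checksum = 0;
		} while (++bm->bm_type < SK_N_TYPE);
		bm->bm_type = 0;
	} while (++bm->bm_class < SK_N_CLASS);

	return (-1);
}

int
main(void)
{
	struct sk_bookmark bm = { 0 };
	int idx, n = 0;

	while (sk_walk(&bm, &idx) == 0)	/* visits every slot entry once */
		n++;
	printf("visited %d entries\n", n);	/* 3 * 2 * 3 * 2 = 36 */
	return (0);
}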
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <util/sscanf.h> + +int ddt_zap_leaf_blockshift = 12; +int ddt_zap_indirect_blockshift = 12; + +static int +ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) +{ + zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; + + if (prehash) + flags |= ZAP_FLAG_PRE_HASHED_KEY; + + *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, + ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + DMU_OT_NONE, 0, tx); + + return (*objectp == 0 ? ENOTSUP : 0); +} + +static int +ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + return (zap_destroy(os, object, tx)); +} + +static int +ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t one, csize; + int error; + + error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, &one, &csize); + if (error) + return (error); + + ASSERT(one == 1); + ASSERT(csize <= sizeof (cbuf)); + + error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf); + if (error) + return (error); + + ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); + + return (0); +} + +static void +ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS); +} + +static int +ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize; + + csize = ddt_compress(dde->dde_phys, cbuf, + sizeof (dde->dde_phys), sizeof (cbuf)); + + return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf, tx)); +} + +static int +ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, tx)); +} + +static int +ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + zap_cursor_init_serialized(&zc, os, object, *walk); + if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize = za.za_num_integers; + ASSERT(za.za_integer_length == 1); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, + DDT_KEY_WORDS, 1, csize, cbuf); + ASSERT(error == 0); + if (error == 0) { + ddt_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); + dde->dde_key = *(ddt_key_t *)za.za_name; + } + zap_cursor_advance(&zc); + *walk = zap_cursor_serialize(&zc); + } + zap_cursor_fini(&zc); + return (error); +} + +static uint64_t +ddt_zap_count(objset_t *os, uint64_t object) +{ + uint64_t count = 0; + + VERIFY(zap_count(os, object, &count) == 0); + + return (count); 
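The routines in this file are exported through a single table of function pointers (ddt_zap_ops, defined just below), so the generic DDT code never calls the ZAP layer directly. A small sketch of that vtable shape, with assumed names and a stand-in count routine:

/*
 * Sketch of a backend ops vtable (assumed names, not the imported
 * source).  Each storage backend supplies one such table; callers only
 * go through the function pointers.
 */
#include <stdio.h>
#include <stdint.h>

typedef struct sketch_ops {
	const char *op_name;
	int (*op_count)(uint64_t *countp);
} sketch_ops_t;

static int
sketch_zap_count(uint64_t *countp)
{
	*countp = 42;	/* stand-in for a real zap_count() call */
	return (0);
}

static const sketch_ops_t sketch_zap_ops = {
	"zap",
	sketch_zap_count,
};

int
main(void)
{
	uint64_t count;

	if (sketch_zap_ops.op_count(&count) == 0)
		printf("%s backend holds %llu entries\n",
		    sketch_zap_ops.op_name, (unsigned long long)count);
	return (0);
}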
+} + +const ddt_ops_t ddt_zap_ops = { + "zap", + ddt_zap_create, + ddt_zap_destroy, + ddt_zap_lookup, + ddt_zap_prefetch, + ddt_zap_update, + ddt_zap_remove, + ddt_zap_walk, + ddt_zap_count, +}; diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c new file mode 100644 index 000000000000..39234eba53b2 --- /dev/null +++ b/uts/common/fs/zfs/dmu.c @@ -0,0 +1,1764 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_prop.h> +#include <sys/dmu_zfetch.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/sa.h> +#ifdef _KERNEL +#include <sys/vmsystm.h> +#include <sys/zfs_znode.h> +#endif + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + { byteswap_uint8_array, TRUE, "unallocated" }, + { zap_byteswap, TRUE, "object directory" }, + { byteswap_uint64_array, TRUE, "object array" }, + { byteswap_uint8_array, TRUE, "packed nvlist" }, + { byteswap_uint64_array, TRUE, "packed nvlist size" }, + { byteswap_uint64_array, TRUE, "bpobj" }, + { byteswap_uint64_array, TRUE, "bpobj header" }, + { byteswap_uint64_array, TRUE, "SPA space map header" }, + { byteswap_uint64_array, TRUE, "SPA space map" }, + { byteswap_uint64_array, TRUE, "ZIL intent log" }, + { dnode_buf_byteswap, TRUE, "DMU dnode" }, + { dmu_objset_byteswap, TRUE, "DMU objset" }, + { byteswap_uint64_array, TRUE, "DSL directory" }, + { zap_byteswap, TRUE, "DSL directory child map"}, + { zap_byteswap, TRUE, "DSL dataset snap map" }, + { zap_byteswap, TRUE, "DSL props" }, + { byteswap_uint64_array, TRUE, "DSL dataset" }, + { zfs_znode_byteswap, TRUE, "ZFS znode" }, + { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, + { byteswap_uint8_array, FALSE, "ZFS plain file" }, + { zap_byteswap, TRUE, "ZFS directory" }, + { zap_byteswap, TRUE, "ZFS master node" }, + { zap_byteswap, TRUE, "ZFS delete queue" }, + { byteswap_uint8_array, FALSE, "zvol object" }, + { zap_byteswap, TRUE, "zvol prop" }, + { byteswap_uint8_array, FALSE, "other uint8[]" }, + { byteswap_uint64_array, FALSE, "other uint64[]" }, + { zap_byteswap, TRUE, "other ZAP" }, + { zap_byteswap, TRUE, "persistent error log" }, + { byteswap_uint8_array, TRUE, "SPA history" }, + { byteswap_uint64_array, TRUE, "SPA history offsets" }, + { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "DSL permissions" }, + { 
zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, + { byteswap_uint8_array, TRUE, "FUID table" }, + { byteswap_uint64_array, TRUE, "FUID table size" }, + { zap_byteswap, TRUE, "DSL dataset next clones"}, + { zap_byteswap, TRUE, "scan work queue" }, + { zap_byteswap, TRUE, "ZFS user/group used" }, + { zap_byteswap, TRUE, "ZFS user/group quota" }, + { zap_byteswap, TRUE, "snapshot refcount tags"}, + { zap_byteswap, TRUE, "DDT ZAP algorithm" }, + { zap_byteswap, TRUE, "DDT statistics" }, + { byteswap_uint8_array, TRUE, "System attributes" }, + { zap_byteswap, TRUE, "SA master node" }, + { zap_byteswap, TRUE, "SA attr registration" }, + { zap_byteswap, TRUE, "SA attr layouts" }, + { zap_byteswap, TRUE, "scan translations" }, + { byteswap_uint8_array, FALSE, "deduplicated block" }, + { zap_byteswap, TRUE, "DSL deadlist map" }, + { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, + { zap_byteswap, TRUE, "DSL dir clones" }, + { byteswap_uint64_array, TRUE, "bpobj subobj" }, +}; + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + dnode_t *dn; + uint64_t blkid; + dmu_buf_impl_t *db; + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + blkid = dbuf_whichblock(dn, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) { + err = EIO; + } else { + err = dbuf_read(db, NULL, db_flags); + if (err) { + dbuf_rele(db, tag); + db = NULL; + } + } + + dnode_rele(dn, FTAG); + *dbp = &db->db; /* NULL db plus first field offset is NULL */ + return (err); +} + +int +dmu_bonus_max(void) +{ + return (DN_MAX_BONUSLEN); +} + +int +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = EINVAL; + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = EINVAL; + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +int +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (type > DMU_OT_NUMTYPES) { + error = EINVAL; + } else if (dn->dn_bonus != db) { + error = EINVAL; + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); +} + +int +dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + dbuf_rm_spill(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_rm_spill(dn, tx); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); +} + +/* + * returns ENOENT, EIO, or 0. 
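The dmu_ot table above pairs every object type with a byteswap routine, a metadata flag, and a printable name, and the rest of the DMU simply indexes it by type. A trimmed-down sketch of that pattern with assumed names and two made-up types (the byteswap uses the GCC/Clang __builtin_bswap64 builtin):

/*
 * Sketch of a type-info table indexed by object type (assumed names,
 * not the imported source).
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

typedef enum { SK_OT_PLAIN, SK_OT_META, SK_OT_NUMTYPES } sk_object_type_t;

typedef struct sk_type_info {
	void (*ot_byteswap)(void *buf, size_t size);
	int ot_metadata;
	const char *ot_name;
} sk_type_info_t;

static void
sk_byteswap_uint8(void *buf, size_t size)
{
	(void) buf; (void) size;	/* byte arrays need no swapping */
}

static void
sk_byteswap_uint64(void *buf, size_t size)
{
	uint64_t *p = buf;

	for (size_t i = 0; i < size / sizeof (uint64_t); i++)
		p[i] = __builtin_bswap64(p[i]);
}

static const sk_type_info_t sk_ot[SK_OT_NUMTYPES] = {
	{ sk_byteswap_uint8,	0, "plain data" },
	{ sk_byteswap_uint64,	1, "metadata array" },
};

int
main(void)
{
	uint64_t v[1] = { 0x0102030405060708ULL };

	sk_ot[SK_OT_META].ot_byteswap(v, sizeof (v));
	printf("%s swapped to %llx\n", sk_ot[SK_OT_META].ot_name,
	    (unsigned long long)v[0]);
	return (0);
}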
+ */ +int +dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + dmu_buf_impl_t *db; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + if (error) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus == NULL) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dbuf_create_bonus(dn); + } + db = dn->dn_bonus; + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) { + VERIFY(dnode_add_ref(dn, db)); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. + */ + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); + + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); + + *dbp = &db->db; + return (0); +} + +/* + * returns ENOENT, EIO, or 0. + * + * This interface will allocate a blank spill dbuf when a spill blk + * doesn't already exist on the dnode. + * + * if you only want to find an already existing spill db, then + * dmu_spill_hold_existing() should be used. + */ +int +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = NULL; + int err; + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + + ASSERT(db != NULL); + err = dbuf_read(db, NULL, flags); + if (err == 0) + *dbp = &db->db; + else + dbuf_rele(db, tag); + return (err); +} + +int +dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { + err = EINVAL; + } else { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + err = ENOENT; + } else { + err = dmu_spill_hold_by_dnode(dn, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); + } + + rw_exit(&dn->dn_struct_rwlock); + } + + DB_DNODE_EXIT(db); + return (err); +} + +int +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Note: longer-term, we should modify all of the dmu_buf_*() interfaces + * to take a held dnode rather than <os, object> -- the lookup is wasteful, + * and can induce severe lock contention when writing to several files + * whose dnodes are in the same block. 
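dmu_bonus_hold above takes dn_struct_rwlock as a reader and, only if the bonus dbuf still has to be created, drops it and retakes it as a writer, re-checking before creating so two racing callers cannot both create it. A minimal pthreads sketch of that lock-upgrade-and-recheck pattern; the names and the lazily created object are illustrative, and unlike the real routine this sketch simply drops the lock at the end.

/*
 * Sketch of reader lock -> writer lock upgrade with a re-check (assumed
 * names, not the imported source).
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t sk_lock = PTHREAD_RWLOCK_INITIALIZER;
static int *sk_bonus;	/* lazily created shared object */

static int *
sk_bonus_hold(void)
{
	int *b;

	(void) pthread_rwlock_rdlock(&sk_lock);
	if (sk_bonus == NULL) {
		(void) pthread_rwlock_unlock(&sk_lock);
		(void) pthread_rwlock_wrlock(&sk_lock);
		if (sk_bonus == NULL)	/* re-check under the write lock */
			sk_bonus = calloc(1, sizeof (int));
	}
	b = sk_bonus;
	(void) pthread_rwlock_unlock(&sk_lock);
	return (b);
}

int
main(void)
{
	printf("bonus at %p\n", (void *)sk_bonus_hold());
	return (0);
}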
+ */ +static int +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) +{ + dsl_pool_t *dp = NULL; + dmu_buf_t **dbp; + uint64_t blkid, nblks, i; + uint32_t dbuf_flags; + int err; + zio_t *zio; + hrtime_t start; + + ASSERT(length <= DMU_MAX_ACCESS); + + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - + P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; + } else { + if (offset + length > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); + return (EIO); + } + nblks = 1; + } + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + if (db == NULL) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_rele_array(dbp, nblks, tag); + zio_nowait(zio); + return (EIO); + } + /* initiate async i/o */ + if (read) { + (void) dbuf_read(db, zio, dbuf_flags); + } + dbp[i] = &db->db; + } + rw_exit(&dn->dn_struct_rwlock); + + /* wait for async i/o */ + err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } + + *numbufsp = nblks; + *dbpp = dbp; + return (0); +} + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); + + return (err); +} + +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +{ + int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + if (numbufs == 0) + return; + + for (i = 0; i 
< numbufs; i++) { + if (dbp[i])
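dmu_buf_hold_array_by_dnode above sizes the dbuf array by rounding the access range out to block boundaries. A small sketch of that arithmetic with the P2ROUNDUP/P2ALIGN macros spelled out; the names and example numbers are illustrative, not from this file.

/*
 * Sketch: how many power-of-two blocks an (offset, length) range spans.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t
sk_nblks(uint64_t offset, uint64_t length, int blkshift)
{
	uint64_t blksz = 1ULL << blkshift;
	uint64_t first = offset & ~(blksz - 1);	/* P2ALIGN: start of first block */
	uint64_t last = (offset + length + blksz - 1) & ~(blksz - 1);
						/* P2ROUNDUP: end of last block */

	return ((last - first) >> blkshift);
}

int
main(void)
{
	/* A 400000-byte access at offset 100000 in 128K blocks spans 4 blocks. */
	printf("%llu\n", (unsigned long long)sk_nblks(100000, 400000, 17));
	return (0);
}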